import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# Load Extractive QA Model (Like ChatPDF)
model_name = "deepset/roberta-base-squad2"


@st.cache_resource
def _load_qa_pipeline(name):
    """Build the extractive question-answering pipeline for model *name*.

    Streamlit re-executes this script from the top on every widget
    interaction; ``st.cache_resource`` keeps a single pipeline alive so the
    model is not rebuilt on each rerun.
    """
    return pipeline("question-answering", model=name)


@st.cache_resource
def _load_embedding_model(name):
    """Build the sentence-embedding model *name*, cached across script reruns."""
    return SentenceTransformer(name)


qa_pipeline = _load_qa_pipeline(model_name)

# Load Sentence Embeddings Model
embedding_model = _load_embedding_model("all-MiniLM-L6-v2")

# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. The remaining page texts are separated by a newline
    and the combined result has leading/trailing whitespace stripped, so a
    PDF with no extractable text produces ``""``.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The combined page text.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # Joining with "\n" and stripping equals appending "\n" after each page
    # and stripping: only the trailing newline differs before the strip.
    return "\n".join(content for content in page_texts if content).strip()

# Function to Split Text into Chunks
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks with a recursive character splitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded as ``chunk_overlap``. Defaults to 50, the
            value that used to be hard-coded, so existing callers are
            unaffected. Pass a smaller value when using a small
            ``chunk_size``.
            NOTE(review): the splitter is expected to reject an overlap
            larger than the chunk size - confirm against the installed
            LangChain version.

    Returns:
        The value returned by the splitter's ``split_text`` for *text*.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: The text chunks to embed.

    Returns:
        A tuple ``(index, chunks, embeddings)``: the ``faiss.IndexFlatL2``
        the vectors were added to, the *chunks* argument unchanged, and the
        float32 embedding matrix. When *chunks* is empty (or ``None``) the
        result is ``(None, None, None)``.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Encode the whole list in a single call so the model can batch the
    # chunks instead of running one forward pass per chunk. ``list(...)``
    # guarantees a list input, which yields one row per chunk.
    vectors = embedding_model.encode(list(chunks))
    # The index is fed a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings

# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk that the index reports as nearest to *question*.

    Args:
        question: The question text; it is embedded with the module-level
            ``embedding_model``.
        index: Object exposing ``search(vectors, k)``, or ``None`` when no
            index could be built.
        chunks: Sequence of chunks, looked up by the position the index
            returns.
        embeddings: Unused by this function.

    Returns:
        The selected chunk, or the literal message
        ``"No valid text found in the PDF."`` when *index* is ``None``.
    """
    if index is not None:
        # A single query row of float32 values.
        query = embedding_model.encode(question).reshape(1, -1).astype(np.float32)
        _distances, neighbours = index.search(query, 1)
        best_position = neighbours[0][0]
        return chunks[best_position]
    return "No valid text found in the PDF."

# Function to Extract the Best Answer
def get_answer(question, context):
    """Run the module-level QA pipeline and return its ``'answer'`` field.

    Args:
        question: The question to answer.
        context: The passage handed to the pipeline as ``context``.

    Returns:
        The value stored under ``'answer'`` in the pipeline's response.
    """
    prediction = qa_pipeline(context=context, question=question)
    answer = prediction['answer']
    return answer

# Streamlit UI
st.title("Chat with AWS Restart PDF")


@st.cache_resource
def _load_document(path):
    """Extract, chunk and index the PDF at *path*, cached across reruns.

    Streamlit re-executes the whole script on every interaction; caching
    keeps the text extraction and chunk embedding from being repeated each
    time. The cache is keyed on *path* only, so restart the app (or clear
    the cache) if the file's contents change.

    Returns:
        ``(text, index, chunks, embeddings)``; the last three are ``None``
        when no chunks could be produced.
    """
    text = extract_clean_text(path)
    return (text, *create_faiss_index(split_text(text)))


# Load & Process PDF
pdf_path = "AWS restart program information.docx.pdf"
pdf_text, index, chunks, embeddings = _load_document(pdf_path)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

# User Input
question = st.text_input("Ask a question about AWS Restart program:")

# Ignore whitespace-only questions; there is nothing to answer.
if st.button("Get Answer") and question.strip():
    if index is None:
        # No document context is available. Passing the placeholder message
        # from find_best_chunk to the QA model would make it extract an
        # "answer" from that message, so report the problem instead.
        st.write("⚠ Cannot answer: no text could be extracted from the PDF.")
    else:
        relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)