"""Streamlit app: ask questions about a PDF using FAISS retrieval + extractive QA.

Pipeline: extract text from the PDF -> split it into overlapping chunks ->
embed the chunks and store them in a FAISS L2 index -> for each question,
retrieve the nearest chunk and run an extractive question-answering model
over it.
"""

import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# Streamlit re-executes this whole script on every widget interaction, so
# anything expensive must be cached or it is redone on each click.
# Fall back to a no-op decorator if this Streamlit build has no cache_resource.
_cache_resource = getattr(st, "cache_resource", lambda func: func)

# Extractive QA model (answers are spans copied out of the context).
model_name = "deepset/roberta-base-squad2"


@_cache_resource
def _load_qa_pipeline():
    """Build the question-answering pipeline once per server process."""
    return pipeline("question-answering", model=model_name)


@_cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer("all-MiniLM-L6-v2")


qa_pipeline = _load_qa_pipeline()
embedding_model = _load_embedding_model()


def extract_clean_text(pdf_path):
    """Return the text of every page of the PDF, joined by newlines.

    Pages for which ``extract_text()`` returns nothing (e.g. image-only
    pages) are skipped. The result is stripped of leading/trailing
    whitespace and is ``""`` when no page yields text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_text = page.extract_text()
            if extracted_text:  # skip pages without extractable text
                page_texts.append(extracted_text + "\n")
    return "".join(page_texts).strip()


def split_text(text, chunk_size=500):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Consecutive chunks overlap by 50 characters so that a sentence cut at a
    chunk boundary still appears whole in one of the neighbours.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=50
    )
    return text_splitter.split_text(text)


def create_faiss_index(chunks):
    """Embed ``chunks`` and store the vectors in a FAISS L2 index.

    Returns:
        ``(index, chunks, embeddings)``, or ``(None, None, None)`` when
        ``chunks`` is empty (for example when text extraction failed).
    """
    if not chunks:
        return None, None, None

    # One batched encode call instead of one model call per chunk.
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix of shape (n_chunks, dim).
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings


def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest to the question's.

    ``embeddings`` is not used; it is kept so existing callers keep working.
    When ``index`` is ``None`` the message "No valid text found in the PDF."
    is returned instead of a chunk. Callers should check ``index`` first and
    not pass that message on as QA context.
    """
    if index is None:
        return "No valid text found in the PDF."

    question_embedding = (
        embedding_model.encode(question).reshape(1, -1).astype(np.float32)
    )
    _, closest_idx = index.search(question_embedding, 1)
    return chunks[int(closest_idx[0][0])]


def get_answer(question, context):
    """Return the answer span the QA model extracts from ``context``."""
    response = qa_pipeline(question=question, context=context)
    return response["answer"]


@_cache_resource
def _load_document(path):
    """Extract, chunk and index the PDF at ``path``.

    Returns ``(pdf_text, index, chunks, embeddings)``. When caching is
    available the result is keyed on ``path`` only, so a changed file on
    disk is picked up after the cache is cleared or the server restarts.
    """
    text = extract_clean_text(path)
    return (text,) + create_faiss_index(split_text(text))


# Streamlit UI
st.title("Chat with AWS Restart PDF")

# Load & process the PDF
pdf_path = "AWS restart program information.docx.pdf"
pdf_text, index, chunks, embeddings = _load_document(pdf_path)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

# User input
question = st.text_input("Ask a question about AWS Restart program:")

if st.button("Get Answer") and question.strip():
    if index is None:
        # Nothing to search: do not run the QA model over an error message.
        st.write("No valid text found in the PDF.")
    else:
        relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)