# Page header residue: SujathaL's picture | Update app.py | ce554ac verified
import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the expensive model loads go through cached factories
# instead of being rebuilt on each rerun.
@st.cache_resource
def _load_qa_pipeline(name):
    """Build the extractive question-answering pipeline for checkpoint *name*."""
    return pipeline("question-answering", model=name)


@st.cache_resource
def _load_embedding_model(name):
    """Load the sentence-embedding model identified by *name*."""
    return SentenceTransformer(name)


# Load Extractive QA Model (Like ChatPDF)
model_name = "deepset/roberta-base-squad2"
qa_pipeline = _load_qa_pipeline(model_name)
# Load Sentence Embeddings Model
embedding_model = _load_embedding_model("all-MiniLM-L6-v2")
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Return the combined text of all pages of the PDF at *pdf_path*.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    Each kept page's text is followed by a newline, and leading/trailing
    whitespace is stripped from the combined string.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # Keep only pages that actually produced text, one newline after each.
    combined = "".join(content + "\n" for content in page_texts if content)
    return combined.strip()
# Function to Split Text into Chunks
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50, the value that was previously hard-coded, so
            existing callers see no change.

    Returns:
        Whatever ``split_text`` on the splitter returns for *text*.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed *chunks* and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed.

    Returns:
        A tuple ``(index, chunks, embeddings)``. When *chunks* is empty or
        ``None`` the tuple is ``(None, None, None)`` so callers can detect
        that there is nothing to search.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Encode every chunk in a single call so the model can batch the work,
    # instead of invoking it once per chunk from a Python loop.
    vectors = embedding_model.encode(list(chunks))
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest to *question*.

    Args:
        question: The user's question text.
        index: FAISS index to search, or ``None`` when no text was indexed.
        chunks: Sequence of text chunks, positionally aligned with *index*.
        embeddings: Unused here; kept so the signature stays compatible
            with existing callers.

    Returns:
        The best-matching chunk, or the message
        ``"No valid text found in the PDF."`` when there is no index or the
        search yields no neighbour.
    """
    no_text_message = "No valid text found in the PDF."
    if index is None:
        return no_text_message
    question_embedding = embedding_model.encode(question).reshape(1, -1).astype(np.float32)
    _, closest_idx = index.search(question_embedding, 1)
    best = int(closest_idx[0][0])
    if best < 0:
        # FAISS uses -1 as the label when it has no neighbour to report;
        # indexing chunks[-1] would silently return the last chunk instead.
        return no_text_message
    return chunks[best]
# Function to Extract the Best Answer
def get_answer(question, context):
    """Run the QA pipeline on *question* against *context*.

    Returns the value stored under the ``'answer'`` key of the pipeline's
    result.
    """
    return qa_pipeline(question=question, context=context)["answer"]
# Streamlit UI
st.title("Chat with AWS Restart PDF")


@st.cache_resource
def _load_knowledge_base(path):
    """Extract, chunk and index the PDF at *path*.

    Cached because Streamlit re-runs the whole script on every widget
    interaction; without it the PDF would be re-parsed and re-embedded each
    time the user types a question or presses the button.

    Returns:
        ``(text, index, chunks, embeddings)``; the last three are ``None``
        when no text could be extracted.
    """
    text = extract_clean_text(path)
    faiss_index, indexed_chunks, vectors = create_faiss_index(split_text(text))
    return text, faiss_index, indexed_chunks, vectors


# Load & Process PDF
pdf_path = "AWS restart program information.docx.pdf"
pdf_text, index, chunks, embeddings = _load_knowledge_base(pdf_path)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

# User Input
question = st.text_input("Ask a question about AWS Restart program:")
if st.button("Get Answer") and question:
    if index is None:
        # Without an index there is no real context; do not feed a
        # placeholder message to the QA model and present its output as an
        # answer.
        st.write("⚠ Cannot answer: no valid text found in the PDF.")
    else:
        relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)