SujathaL's picture
Update app.py
5752742 verified
raw
history blame
2.49 kB
import streamlit as st
from transformers import pipeline
import pdfplumber
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
# Load Hugging Face extractive Question Answering model (RoBERTa fine-tuned on SQuAD2);
# given a question plus a context string, it returns a span of the context as the answer.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
# Sentence-embedding model used to rank text chunks by cosine similarity to the question.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Function to Extract and Clean Text from PDF
def extract_clean_text(pdf_path):
    """Extract text from every page of *pdf_path* and return it cleaned.

    Fixes over the original:
    - ``page.extract_text()`` returns ``None`` for image-only pages; those
      are now skipped instead of raising ``TypeError`` on ``None + "\\n"``.
    - The "Label:" -> "## Label" heading regex is applied *before*
      whitespace is collapsed.  The original collapsed every ``\\s`` run
      (including newlines) first, so its ``(?<=\\n)`` lookbehind could
      never match and the heading feature was dead code.
    - Only horizontal whitespace is collapsed, so the blank lines that
      separate the inserted headings survive.

    Parameters:
        pdf_path: path to the PDF file to read.

    Returns:
        A single cleaned string containing the text of all pages.
    """
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # None for pages with no extractable text
                pages.append(page_text)
    text = "\n".join(pages)
    # Promote "Label:" lines to markdown-style headings while real line
    # boundaries still exist (re.MULTILINE anchors ^ at each line start).
    text = re.sub(r'^([A-Z][a-z]+.*?):', r'\n\n## \1\n', text, flags=re.MULTILINE)
    text = re.sub(r'[ \t]+', ' ', text)     # collapse runs of spaces/tabs only
    text = re.sub(r'\n{3,}', '\n\n', text)  # cap consecutive blank lines at one
    text = text.replace(" .", ".")          # fix misplaced spaces before periods
    return text
# Function to Split Text into Chunks
def split_text(text):
    """Split *text* into overlapping ~500-character chunks for retrieval.

    Uses LangChain's RecursiveCharacterTextSplitter with a 50-character
    overlap so sentence fragments at chunk borders appear in both chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_text(text)
# Function to Find the Most Relevant Chunk Using Embeddings
def find_best_chunk(question, chunks):
    """Return the chunk most semantically similar to *question*.

    Improvements over the original:
    - Returns ``None`` for an empty *chunks* list (the original crashed
      with ``ValueError`` from ``max()`` on an empty sequence).
    - Encodes all chunks in one batched ``encode`` call instead of a
      Python loop of per-chunk calls, and picks the best index with a
      single cosine-similarity matrix + ``argmax``.

    Parameters:
        question: the user's question string.
        chunks: list of candidate text chunks.

    Returns:
        The most similar chunk, or ``None`` when *chunks* is empty.
    """
    if not chunks:
        return None
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    # Batched encode: one call for the whole list of chunks.
    chunk_embeddings = embedding_model.encode(chunks, convert_to_tensor=True)
    # Shape (1, len(chunks)); argmax over it gives the best chunk's index.
    similarities = util.pytorch_cos_sim(question_embedding, chunk_embeddings)
    best_chunk_index = int(similarities.argmax())
    return chunks[best_chunk_index]
# Streamlit UI
st.title("Chat with AWS Restart PDF")


# Streamlit re-executes this whole script on every widget interaction;
# cache the expensive extract+split so the PDF is parsed only once.
@st.cache_data(show_spinner=False)
def _load_chunks(path):
    # Extract, clean, and chunk the PDF at *path*.
    return split_text(extract_clean_text(path))


pdf_path = "AWS restart program information.docx.pdf"  # Change to your uploaded file
chunks = _load_chunks(pdf_path)
st.write("✅ PDF Loaded Successfully!")

# User Input
question = st.text_input("Ask a question about AWS Restart program:")
if st.button("Get Answer") and question:
    relevant_chunk = find_best_chunk(question, chunks)  # Retrieve the best chunk
    response = qa_pipeline(question=question, context=relevant_chunk)  # Ask the model
    st.write("Answer:", response['answer'])