|
|
import streamlit as st |
|
|
import pdfplumber |
|
|
import faiss |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
model_name = "deepset/roberta-base-squad2"


@st.cache_resource
def _load_qa_pipeline(name):
    """Build the question-answering pipeline for the model called ``name``.

    Streamlit re-executes this script on every widget interaction, so an
    uncached ``pipeline(...)`` call would rebuild the model on each rerun.
    Caching it as a resource keeps a single instance alive and reuses it.
    """
    return pipeline("question-answering", model=name)


qa_pipeline = _load_qa_pipeline(model_name)
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_embedding_model(name="all-MiniLM-L6-v2"):
    """Create the sentence-embedding model called ``name``.

    Cached as a Streamlit resource so the model is constructed once and
    reused, instead of being re-created on every script rerun.
    """
    return SentenceTransformer(name)


embedding_model = _load_embedding_model()
|
|
|
|
|
|
|
|
def extract_clean_text(pdf_path):
    """Return the combined text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing are skipped. The
    remaining page texts are joined with newlines, and leading/trailing
    whitespace is stripped from the final string. An empty string is
    returned when no page produced any text.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(content for content in page_texts if content).strip()
|
|
|
|
|
|
|
|
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Target size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter. Defaults to 50, the value that used to be hard-coded,
            so existing callers are unaffected; callers choosing a small
            ``chunk_size`` can now shrink the overlap to match.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (presumably a list of strings -- confirm against LangChain).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
|
|
|
|
|
|
|
|
def create_faiss_index(chunks):
    """Embed ``chunks`` and store the vectors in a flat L2 faiss index.

    Args:
        chunks: Sequence of text chunks to embed.

    Returns:
        A ``(index, chunks, embeddings)`` tuple, or ``(None, None, None)``
        when ``chunks`` is empty or None.
    """
    if not chunks:
        return None, None, None

    # Encode all chunks in one call so the model can batch them, rather than
    # invoking encode() once per chunk from a Python loop.
    vectors = embedding_model.encode(list(chunks), convert_to_numpy=True)
    # Build the index from a contiguous float32 matrix, matching the dtype
    # the original per-chunk code requested.
    embeddings = np.ascontiguousarray(vectors, dtype=np.float32)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
|
|
|
|
|
|
|
|
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest to ``question``.

    Args:
        question: The user's question text.
        index: Search index built over the chunk embeddings, or None.
        chunks: Chunk texts, in the same order they were added to ``index``.
        embeddings: Unused here; kept so existing callers keep working.

    Returns:
        The best-matching chunk, or the string
        ``"No valid text found in the PDF."`` when there is no index, no
        chunks, or the search yields no neighbour.
    """
    no_text_message = "No valid text found in the PDF."
    if index is None or not chunks:
        return no_text_message

    query = np.ascontiguousarray(
        embedding_model.encode(question), dtype=np.float32
    ).reshape(1, -1)
    _, neighbor_ids = index.search(query, 1)

    best = int(neighbor_ids[0][0])
    # A negative label means the search found nothing; without this check
    # it would silently select chunks[-1].
    if best < 0:
        return no_text_message
    return chunks[best]
|
|
|
|
|
|
|
|
def get_answer(question, context):
    """Answer ``question`` from ``context`` using the module's QA pipeline.

    Returns the ``'answer'`` entry of the pipeline's result.
    """
    return qa_pipeline(question=question, context=context)['answer']
|
|
|
|
|
|
|
|
st.title("Chat with AWS Restart PDF")

pdf_path = "AWS restart program information.docx.pdf"


@st.cache_resource
def _build_knowledge_base(path):
    """Extract, chunk and index the PDF at ``path``.

    Streamlit re-runs this script on every interaction; caching the result
    keeps the PDF from being re-parsed and re-embedded each time. The cache
    is keyed on ``path`` only, so clear the Streamlit cache (or restart the
    app) after replacing the file on disk.

    Returns:
        ``(text, index, chunks, embeddings)``. The last three are None when
        ``create_faiss_index`` receives no chunks.
    """
    text = extract_clean_text(path)
    built_index, kept_chunks, vectors = create_faiss_index(split_text(text))
    return text, built_index, kept_chunks, vectors


pdf_text, index, chunks, embeddings = _build_knowledge_base(pdf_path)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

question = st.text_input("Ask a question about AWS Restart program:")

# Ignore whitespace-only questions as well as empty ones.
if st.button("Get Answer") and question.strip():
    if index is None:
        # Without an index there is nothing to search. Previously the
        # "no valid text" sentinel string was handed to the QA model as
        # context, which produced a meaningless answer.
        st.write("⚠ No valid text found in the PDF. Please check the document format.")
    else:
        relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)
|
|
|