| import streamlit as st |
| import pdfplumber |
| import faiss |
| import numpy as np |
| from sentence_transformers import SentenceTransformer |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from transformers import pipeline |
|
|
| |
model_name = "deepset/roberta-base-squad2"


@st.cache_resource
def _load_qa_pipeline(name):
    """Build the extractive question-answering pipeline for ``name``.

    Wrapped in ``st.cache_resource`` because Streamlit re-executes this
    script on every widget interaction; without the cache the model would
    be re-instantiated on each rerun.

    Args:
        name: Model identifier passed to ``transformers.pipeline``.

    Returns:
        The ``"question-answering"`` pipeline object.
    """
    return pipeline("question-answering", model=name)


qa_pipeline = _load_qa_pipeline(model_name)
|
|
| |
@st.cache_resource
def _load_embedding_model(name="all-MiniLM-L6-v2"):
    """Build the sentence-embedding model used for chunk retrieval.

    Wrapped in ``st.cache_resource`` because Streamlit re-executes this
    script on every widget interaction; without the cache the model would
    be re-instantiated on each rerun.

    Args:
        name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(name)


embedding_model = _load_embedding_model()
|
|
| |
def extract_clean_text(pdf_path):
    """Return the text of every page of a PDF as one stripped string.

    Pages are read with pdfplumber in document order. A page whose
    ``extract_text()`` result is falsy (``None`` or an empty string) is
    skipped; the remaining page texts are separated by a newline.

    Args:
        pdf_path: Value handed to ``pdfplumber.open``.

    Returns:
        The combined page text with leading and trailing whitespace
        removed; an empty string when no page produced any text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extract while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]
    # filter(None, ...) drops pages that yielded no text.
    return "\n".join(filter(None, page_texts)).strip()
|
|
| |
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into overlapping chunks.

    Delegates to LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Previously
            fixed at 50; the default keeps existing callers unchanged.
            NOTE(review): the splitter is expected to reject an overlap
            larger than ``chunk_size`` -- confirm against the installed
            LangChain version before calling with a small ``chunk_size``.

    Returns:
        The list of chunks produced by ``split_text`` on the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
|
|
| |
def create_faiss_index(chunks):
    """Embed text chunks and load them into a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed.

    Returns:
        A tuple ``(index, chunks, embeddings)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per chunk and
        ``embeddings`` is the float32 array that was added to it.
        Returns ``(None, None, None)`` when ``chunks`` is empty.
    """
    if not chunks:
        return None, None, None
    # One batched encode call instead of one model invocation per chunk.
    vectors = embedding_model.encode(list(chunks), convert_to_numpy=True)
    # Hand FAISS an explicit C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
|
|
| |
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest to the question.

    Args:
        question: The question text to embed.
        index: Index object exposing ``search``, or ``None``.
        chunks: Sequence of chunks, indexed by the position the search
            returns.
        embeddings: Not used by this function; kept in the signature for
            existing callers.

    Returns:
        The single nearest chunk, or the literal message
        ``"No valid text found in the PDF."`` when ``index`` is ``None``.
    """
    if index is not None:
        # Shape the query as a single float32 row for the search call.
        query_vector = np.array(
            embedding_model.encode(question).reshape(1, -1).astype(np.float32)
        )
        _distances, neighbour_ids = index.search(query_vector, 1)
        best_position = neighbour_ids[0][0]
        return chunks[best_position]
    return "No valid text found in the PDF."
|
|
| |
def get_answer(question, context):
    """Run the QA pipeline and return the ``'answer'`` entry of its result.

    Args:
        question: Question passed to ``qa_pipeline``.
        context: Context passed to ``qa_pipeline``.

    Returns:
        The value stored under ``'answer'`` in the pipeline response.
    """
    return qa_pipeline(question=question, context=context)['answer']
|
|
| |
st.title("Chat with AWS Restart PDF")


@st.cache_resource
def _load_document_index(path):
    """Extract, chunk and index the PDF at ``path``, cached across reruns.

    Streamlit re-executes the script on every widget interaction; caching
    avoids re-reading the PDF and re-embedding every chunk each time.
    NOTE: the cache is keyed on ``path`` only, so edits to the file on
    disk are not picked up until the cache is cleared.

    Returns:
        ``(pdf_text, index, chunks, embeddings)``; the last three are
        ``None`` when the PDF yielded no text chunks.
    """
    document_text = extract_clean_text(path)
    doc_index, doc_chunks, doc_embeddings = create_faiss_index(
        split_text(document_text)
    )
    return document_text, doc_index, doc_chunks, doc_embeddings


pdf_path = "AWS restart program information.docx.pdf"
pdf_text, index, chunks, embeddings = _load_document_index(pdf_path)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

question = st.text_input("Ask a question about AWS Restart program:")

if st.button("Get Answer") and question:
    if index is None:
        # With no index, find_best_chunk returns a placeholder message.
        # Passing that to the QA model would present words from the
        # placeholder as if they were an answer, so stop here instead.
        st.write("⚠ Cannot answer: no valid text was found in the PDF.")
    else:
        relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)
|
|