File size: 2,708 Bytes
9e5a092 5752742 23be0f8 a1e9850 4c3d6b3 9e5a092 4c3d6b3 2a2dd2f 5f0b01f 6c5b356 23be0f8 5752742 5f0b01f 9e5a092 6c5b356 23be0f8 5f0b01f 23be0f8 5f0b01f 23be0f8 5f0b01f 23be0f8 a1e9850 4c3d6b3 2a2dd2f 4c3d6b3 2a2dd2f 9e5a092 ce554ac 3a62d0c 23be0f8 a1e9850 5f0b01f 9e5a092 3a62d0c 9e5a092 23be0f8 2a2dd2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
# Load Extractive QA Model (Like ChatPDF)
# Extractive QA: the model selects an answer span from the provided context
# rather than generating free-form text.
model_name = "deepset/roberta-base-squad2"
qa_pipeline = pipeline("question-answering", model=model_name)
# Load Sentence Embeddings Model
# Used to embed both PDF chunks and user questions for similarity search.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages with no extractable text are skipped; leading/trailing whitespace
    is stripped from the combined result.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # skip image-only / empty pages
                page_texts.append(page_text)
    return "\n".join(page_texts).strip()
# Function to Split Text into Chunks
def split_text(text, chunk_size=500):
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    A fixed 50-character overlap preserves context across chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
    )
    return splitter.split_text(text)
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Build an L2 FAISS index over sentence embeddings of *chunks*.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to index. May be empty.

    Returns
    -------
    tuple
        ``(index, chunks, embeddings)`` on success, or ``(None, None, None)``
        when *chunks* is empty so callers can detect a failed extraction.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Encode all chunks in a single batch call — SentenceTransformer.encode
    # accepts a list and is much faster than per-chunk Python-loop encoding.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest to *question*.

    Falls back to an error string when *index* is None (no usable PDF text).
    The *embeddings* argument is unused here but kept for interface
    compatibility with the rest of the pipeline.
    """
    if index is None:
        return "No valid text found in the PDF."
    query_vec = embedding_model.encode(question).astype(np.float32).reshape(1, -1)
    _, nearest = index.search(query_vec, 1)
    return chunks[nearest[0][0]]
# Function to Extract the Best Answer
def get_answer(question, context):
    """Extract the best answer span for *question* from *context*.

    Delegates to the extractive QA pipeline (ChatPDF-like behavior).
    """
    result = qa_pipeline(question=question, context=context)
    return result["answer"]
# Streamlit UI
# --- Streamlit UI ---
st.title("Chat with AWS Restart PDF")

# Load & process the bundled PDF once at script startup.
pdf_path = "AWS restart program information.docx.pdf"
pdf_text = extract_clean_text(pdf_path)
chunks = split_text(pdf_text)
index, chunks, embeddings = create_faiss_index(chunks)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

# User Input
question = st.text_input("Ask a question about AWS Restart program:")
if st.button("Get Answer") and question:
    relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
    if index is None:
        # Bug fix: without a valid index, find_best_chunk returns an error
        # string — show it directly instead of feeding it to the QA model
        # as if it were document context.
        st.write(relevant_chunk)
    else:
        response = get_answer(question, relevant_chunk)
        st.write("Answer:", response)
|