| import streamlit as st |
| from transformers import pipeline |
| import pdfplumber |
| import re |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from sentence_transformers import SentenceTransformer, util |
|
|
| |
# Streamlit re-executes this entire script on EVERY user interaction, so
# loading the models at bare module level reloads them on each rerun.
# st.cache_resource keeps one shared instance alive across reruns/sessions.
@st.cache_resource
def _load_qa_pipeline():
    """Load the extractive question-answering model once per process."""
    return pipeline("question-answering", model="deepset/roberta-base-squad2")


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once per process."""
    return SentenceTransformer("all-MiniLM-L6-v2")


# Same module-level names as before, so the functions below are unchanged.
qa_pipeline = _load_qa_pipeline()
embedding_model = _load_embedding_model()
|
|
| |
def extract_clean_text(pdf_path):
    """Extract and normalize the text of every page in *pdf_path*.

    Returns a single string with section-header-looking lines ("Title:")
    marked up as markdown "## " headings, runs of spaces/tabs collapsed,
    and " ." artifacts tightened to ".".
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only/empty pages; the
            # original crashed with TypeError on `None + "\n"` in that case.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    # Mark section headers while line breaks still exist. The original ran
    # this regex AFTER collapsing all whitespace (newlines included) into
    # single spaces, so its (?<=\n) lookbehind could never match (dead code).
    text = re.sub(r'(?m)^([A-Z][a-z]+.*?):', r'\n\n## \1\n', text)

    # Collapse runs of spaces/tabs only, preserving the newlines the header
    # markup above depends on.
    text = re.sub(r'[ \t]+', ' ', text)
    text = text.replace(" .", ".")

    return text
|
|
| |
def split_text(text):
    """Break *text* into ~500-character chunks with 50 characters of overlap.

    The overlap keeps sentences that straddle a chunk boundary retrievable
    from either side.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_text(text)
|
|
| |
def find_best_chunk(question, chunks):
    """Return the chunk most semantically similar to *question*.

    Raises if *chunks* is empty (as the original did via ``max([])``).
    """
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)

    # Encode all chunks in ONE batched call instead of the original per-chunk
    # Python loop — same embeddings, one model forward pass instead of N.
    chunk_embeddings = embedding_model.encode(chunks, convert_to_tensor=True)

    # Single (1 x N) cosine-similarity matrix instead of N separate 1x1 calls.
    similarities = util.pytorch_cos_sim(question_embedding, chunk_embeddings)[0]

    best_chunk_index = int(similarities.argmax())
    return chunks[best_chunk_index]
|
|
| |
st.title("Chat with AWS Restart PDF")


# Streamlit reruns this script on every interaction; without caching, the PDF
# was re-parsed and re-chunked on each keystroke/button press. st.cache_data
# memoizes the (path -> chunks) result across reruns.
@st.cache_data
def _load_chunks(path):
    """Extract, clean, and chunk the PDF at *path* (cached across reruns)."""
    return split_text(extract_clean_text(path))


pdf_path = "AWS restart program information.docx.pdf"
chunks = _load_chunks(pdf_path)

st.write("✅ PDF Loaded Successfully!")

question = st.text_input("Ask a question about AWS Restart program:")

# Only run retrieval + QA once the button is pressed AND a question was typed.
if st.button("Get Answer") and question:
    relevant_chunk = find_best_chunk(question, chunks)
    response = qa_pipeline(question=question, context=relevant_chunk)
    st.write("Answer:", response['answer'])
|
|