SujathaL committed on
Commit
23be0f8
·
verified ·
1 Parent(s): 8a1d8cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -39
app.py CHANGED
@@ -1,57 +1,49 @@
1
  import streamlit as st
2
- from transformers import pipeline
3
  import pdfplumber
4
- import re
 
 
 
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from sentence_transformers import SentenceTransformer, util
7
 
8
- # Load Hugging Face Question Answering model
9
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
10
-
11
- # Load Embeddings Model for Better Context Matching
12
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
13
 
14
- # Function to Extract and Clean Text from PDF
15
  def extract_clean_text(pdf_path):
16
  text = ""
17
  with pdfplumber.open(pdf_path) as pdf:
18
  for page in pdf.pages:
19
  text += page.extract_text() + "\n"
20
-
21
- # Remove extra spaces and newlines
22
- text = re.sub(r'\s+', ' ', text) # Replace multiple spaces/newlines with a single space
23
- text = text.replace(" .", ".") # Fix misplaced spaces before periods
24
-
25
- # Add section headers where possible
26
- text = re.sub(r'(?<=\n)([A-Z][a-z]+.*?):', r'\n\n## \1\n', text) # Convert labels into headings
27
-
28
- return text
29
 
30
  # Function to Split Text into Chunks
31
- def split_text(text):
32
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
33
- chunks = text_splitter.split_text(text)
34
- return chunks
35
-
36
- # Function to Find the Most Relevant Chunk Using Embeddings
37
- def find_best_chunk(question, chunks):
38
- question_embedding = embedding_model.encode(question, convert_to_tensor=True)
39
- chunk_embeddings = [embedding_model.encode(chunk, convert_to_tensor=True) for chunk in chunks]
40
-
41
- # Compute similarity between question and each chunk
42
- similarities = [util.pytorch_cos_sim(question_embedding, chunk_emb).item() for chunk_emb in chunk_embeddings]
43
-
44
- # Find the most relevant chunk
45
- best_chunk_index = similarities.index(max(similarities))
46
- return chunks[best_chunk_index]
47
 
48
  # Streamlit UI
49
- st.title("Chat with AWS Restart PDF")
50
 
51
- # Load and Process PDF
52
- pdf_path = "AWS restart program information.docx.pdf" # Change to your uploaded file
53
- pdf_text = extract_clean_text(pdf_path) # Extract & clean text
54
- chunks = split_text(pdf_text) # Split into chunks
 
55
 
56
  st.write("✅ PDF Loaded Successfully!")
57
 
@@ -59,6 +51,6 @@ st.write("✅ PDF Loaded Successfully!")
59
  question = st.text_input("Ask a question about AWS Restart program:")
60
 
61
  if st.button("Get Answer") and question:
62
- relevant_chunk = find_best_chunk(question, chunks) # Retrieve the best chunk
63
- response = qa_pipeline(question=question, context=relevant_chunk) # Ask the model
64
  st.write("Answer:", response['answer'])
 
1
  import streamlit as st
 
2
  import pdfplumber
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from transformers import pipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
8
 
9
# Load Models
# Extractive QA pipeline: given (question, context) it returns the answer
# span found inside the context.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence-embedding model used to embed both the PDF chunks and the
# user's question for nearest-neighbour retrieval.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Extract the text of every page of the PDF at *pdf_path*.

    Page texts are concatenated (one trailing newline per page, matching
    the original behaviour) and then all newlines are flattened to spaces
    so the downstream splitter works on one continuous string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The flattened text of the whole document ("" for an empty PDF).
    """
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only/empty pages;
            # the original `text += page.extract_text() + "\n"` raised
            # TypeError on such pages. Treat them as empty strings.
            parts.append((page.extract_text() or "") + "\n")
    # "".join avoids the quadratic `text +=` string build.
    return "".join(parts).replace("\n", " ")
 
 
 
 
 
 
 
 
20
 
21
# Function to Split Text into Chunks
def split_text(text, chunk_size=500):
    """Break *text* into retrieval-sized chunks.

    Chunks are at most *chunk_size* characters long, with a fixed
    50-character overlap between consecutive chunks so answers that
    straddle a boundary are not lost.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk (default 500).

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
    )
    chunks = splitter.split_text(text)
    return chunks
25
+
26
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed *chunks* and build an exact-L2 FAISS index over them.

    Args:
        chunks: Non-empty list of text chunks to index.

    Returns:
        A ``(index, chunks, embeddings)`` tuple: the FAISS index, the
        chunks passed in (unchanged), and the float32 embedding matrix.

    Raises:
        ValueError: If *chunks* is empty (the original code failed here
            with an opaque IndexError on ``embeddings.shape[1]``).
    """
    if not chunks:
        raise ValueError("create_faiss_index() requires at least one chunk")
    # Batch-encode all chunks in ONE call instead of one encode() call per
    # chunk -- same vectors, far less per-call model overhead.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    # FAISS expects float32 input; IndexFlatL2 performs exact L2 search.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
32
+
33
# Function to Find Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest (L2) to *question*.

    Args:
        question: The user's question text.
        index: FAISS index built over the chunk embeddings.
        chunks: The chunk strings, in the order they were indexed.
        embeddings: Chunk embedding matrix (unused here; kept so the
            signature stays compatible with existing callers).

    Returns:
        The single best-matching chunk string.
    """
    query = embedding_model.encode(question)
    query = np.array(query).reshape(1, -1)
    _distances, neighbors = index.search(query, 1)
    best = neighbors[0][0]
    return chunks[best]
38
 
39
# Streamlit UI
st.title("Chat with AWS Restart PDF (Like ChatPDF)")

# Load & Process PDF
# NOTE(review): the PDF path is hard-coded and re-processed on every
# Streamlit rerun; assumes the file ships alongside the app -- confirm,
# and consider @st.cache_data for the extraction/indexing steps.
pdf_path = "AWS restart program information.docx.pdf"
pdf_text = extract_clean_text(pdf_path)
chunks = split_text(pdf_text)
index, chunks, embeddings = create_faiss_index(chunks)

st.write("✅ PDF Loaded Successfully!")

question = st.text_input("Ask a question about AWS Restart program:")

# Run retrieval + extractive QA only once the user has both typed a
# question and clicked the button.
if st.button("Get Answer") and question:
    relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
    response = qa_pipeline(question=question, context=relevant_chunk)
    st.write("Answer:", response['answer'])