SujathaL commited on
Commit
5f0b01f
·
verified ·
1 Parent(s): 28341e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -16
app.py CHANGED
@@ -2,19 +2,17 @@ import streamlit as st
2
  import pdfplumber
3
  import faiss
4
  import numpy as np
 
5
  from sentence_transformers import SentenceTransformer
6
- from transformers import AutoModelForCausalLM, AutoTokenizer
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
8
 
9
- # Load Mistral-7B Model for Generative Answers
10
- model_name = "google/flan-t5-base" # Smallest alternative, works on free tier
11
-
12
-
13
-
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
- model = AutoModelForCausalLM.from_pretrained(model_name)
16
 
17
- # Load Sentence Embeddings Model for Better Context Matching
18
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
19
 
20
  # Function to Extract & Clean PDF Text
@@ -22,8 +20,10 @@ def extract_clean_text(pdf_path):
22
  text = ""
23
  with pdfplumber.open(pdf_path) as pdf:
24
  for page in pdf.pages:
25
- text += page.extract_text() + "\n"
26
- return text.replace("\n", " ")
 
 
27
 
28
  # Function to Split Text into Chunks
29
  def split_text(text, chunk_size=500):
@@ -32,22 +32,30 @@ def split_text(text, chunk_size=500):
32
 
33
  # Function to Create FAISS Vector Database
34
  def create_faiss_index(chunks):
35
- embeddings = np.array([embedding_model.encode(chunk) for chunk in chunks])
 
 
36
  index = faiss.IndexFlatL2(embeddings.shape[1])
37
  index.add(embeddings)
38
  return index, chunks, embeddings
39
 
40
- # Function to Find Best Matching Chunk
41
  def find_best_chunk(question, index, chunks, embeddings):
42
- question_embedding = embedding_model.encode(question).reshape(1, -1)
 
 
43
  _, closest_idx = index.search(np.array(question_embedding), 1)
44
  return chunks[closest_idx[0][0]]
45
 
46
  # Function to Generate a Long, Detailed Answer
47
  def get_answer(question, context):
48
  input_text = f"Question: {question}\nContext: {context}\nAnswer:"
49
- inputs = tokenizer(input_text, return_tensors="pt")
50
- output = model.generate(**inputs, max_length=300, temperature=0.7)
 
 
 
 
51
  return tokenizer.decode(output[0], skip_special_tokens=True)
52
 
53
  # Streamlit UI
@@ -59,7 +67,10 @@ pdf_text = extract_clean_text(pdf_path)
59
  chunks = split_text(pdf_text)
60
  index, chunks, embeddings = create_faiss_index(chunks)
61
 
62
- st.write("✅ PDF Loaded Successfully!")
 
 
 
63
 
64
  # User Input
65
  question = st.text_input("Ask a question about AWS Restart program:")
 
2
  import pdfplumber
3
  import faiss
4
  import numpy as np
5
+ import torch
6
  from sentence_transformers import SentenceTransformer
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
 
10
# Generative QA model: Flan-T5, chosen because it is small enough to run on the
# free Hugging Face Spaces tier.
model_name = "google/flan-t5-base"  # Small model that works in Hugging Face Spaces
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Sentence-embedding model used for semantic retrieval over the PDF chunks.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
17
 
18
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Read every page of the PDF at `pdf_path` and return its text.

    Pages that yield no text (e.g. scanned images) are skipped; page texts
    are joined with newlines and the result is stripped of edge whitespace.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:  # skip pages where extraction produced nothing
                page_texts.append(content)
    # Joining then stripping is equivalent to appending "\n" per page and
    # stripping the trailing one at the end.
    return "\n".join(page_texts).strip()
27
 
28
  # Function to Split Text into Chunks
29
  def split_text(text, chunk_size=500):
 
32
 
33
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed `chunks` and build an exact L2 FAISS index over them.

    Returns a tuple (index, chunks, embeddings), or (None, None, None) when
    `chunks` is empty so callers can detect a failed PDF extraction.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Batch-encode all chunks in ONE call: SentenceTransformer.encode accepts a
    # list and returns a 2D array, which is much faster than encoding each
    # chunk individually in a Python loop (same values, one model pass setup).
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
41
 
42
# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest (L2) to the question's.

    Falls back to an explanatory string when no index was built (empty PDF).
    """
    if index is None:
        return "No valid text found in the PDF."
    # FAISS expects a float32 matrix of shape (n_queries, dim).
    query_vec = embedding_model.encode(question).reshape(1, -1).astype(np.float32)
    _, nearest = index.search(np.array(query_vec), 1)
    best_idx = nearest[0][0]
    return chunks[best_idx]
49
 
50
# Function to Generate a Long, Detailed Answer
def get_answer(question, context):
    """Generate an answer to `question` grounded in `context` with Flan-T5.

    Builds a Question/Context prompt, runs the seq2seq model, and returns the
    decoded answer string.
    """
    input_text = f"Question: {question}\nContext: {context}\nAnswer:"
    # Truncate to the model's input limit so oversized contexts don't error out.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: disable autograd to save memory and time.
    # NOTE: the original passed temperature=0.7 WITHOUT do_sample=True; with
    # sampling disabled (the default), temperature is ignored and newer
    # transformers versions emit a warning. Dropping it keeps decoding greedy
    # and the output unchanged.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=300)

    return tokenizer.decode(output[0], skip_special_tokens=True)
60
 
61
  # Streamlit UI
 
67
# Chunk the extracted text and build the retrieval index (handles empty text
# by returning None sentinels from create_faiss_index).
chunks = split_text(pdf_text)
index, chunks, embeddings = create_faiss_index(chunks)

# Report load status to the user.
status = (
    "✅ PDF Loaded Successfully!"
    if pdf_text
    else "⚠ No valid text found in the PDF. Please check the document format."
)
st.write(status)

# User Input
question = st.text_input("Ask a question about AWS Restart program:")