Spaces:

shallou
/

pdfchatbot

Sleeping

App Files Community

shallou committed on Aug 14, 2024

Commit

131ff8a

verified ·

1 Parent(s): 297e092

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -21

app.py CHANGED Viewed

@@ -2,15 +2,48 @@ from dotenv import load_dotenv
 import streamlit as st
 import pickle
 from PyPDF2 import PdfReader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from transformers import pipeline
 import os
 # Load environment variables from .env file
 load_dotenv()
 def main():
     st.header("LLM-powered PDF Chatbot 💬")
@@ -19,43 +52,41 @@ def main():
     if pdf is not None:
         pdf_reader = PdfReader(pdf)
         text = ""
         for page in pdf_reader.pages:
             text += page.extract_text()
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len
-        )
-        chunks = text_splitter.split_text(text=text)
-        # Process and store embeddings
         store_name = pdf.name[:-4]
         st.write(f'{store_name}')
         if os.path.exists(f"{store_name}.pkl"):
             with open(f"{store_name}.pkl", "rb") as f:
-                VectorStore = pickle.load(f)
             st.write('Embeddings Loaded from the Disk')
         else:
-            embeddings = HuggingFaceEmbeddings()
-            VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
             with open(f"{store_name}.pkl", "wb") as f:
-                pickle.dump(VectorStore, f)
         # Accept user questions/query
         query = st.text_input("Ask questions about your PDF file:")
         if query:
-            docs = VectorStore.similarity_search(query=query, k=3)
             # Use Hugging Face pipeline for question answering
-            model_name = "distilbert-base-uncased-distilled-squad"  # Example model
-            qa_pipeline = pipeline("question-answering", model=model_name)
-            context = " ".join([doc.page_content for doc in docs])
-            result = qa_pipeline(question=query, context=context)
             st.write(result['answer'])
 if __name__ == '__main__':

 import streamlit as st
 import pickle
 from PyPDF2 import PdfReader
+from transformers import pipeline, AutoTokenizer, AutoModel
 import os
+import torch
+import numpy as np
 # Load environment variables from .env file
 load_dotenv()
+# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - chunk_overlap`` characters
    apart, so each window repeats the last ``chunk_overlap`` characters of
    the previous one. The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of *text* in document order (empty when *text*
        is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``chunk_overlap`` is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        # A non-positive step would never advance through the text, which
        # previously meant an endless loop that kept appending chunks.
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
+# Function to generate embeddings using transformers
def generate_embeddings(text_chunks, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed every string in *text_chunks* with a Hugging Face encoder.

    The tokenizer and model identified by *model_name* are loaded on each
    call. Every chunk is tokenized on its own (with truncation enabled) and
    run through the model with gradient tracking disabled.

    Args:
        text_chunks: Iterable of strings to embed.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A list with one NumPy array per input chunk, in input order. Each
        array is the mean of the model's last hidden state over dimension 1
        (presumably the token axis of a (batch, tokens, hidden) output --
        confirm for the chosen model).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    def embed(chunk):
        # One chunk per forward pass, exactly as many passes as chunks.
        encoded = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            hidden_states = encoder(**encoded).last_hidden_state
        # Mean-pool over dimension 1, then drop the singleton batch axis.
        return hidden_states.mean(dim=1).squeeze().numpy()

    return [embed(chunk) for chunk in text_chunks]
+# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the index and score of the embedding most similar to the query.

    Similarity is the cosine of the angle between the query vector and each
    row of *text_embeddings*.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: Non-empty sequence (or 2-D array) of vectors with
            the same length as *query_embedding*.

    Returns:
        A ``(best_index, similarity)`` tuple: the position of the highest
        scoring row as an ``int`` and its cosine similarity as a ``float``.

    Raises:
        ValueError: If *text_embeddings* is empty or is not a sequence of
            equal-length vectors.
    """
    matrix = np.asarray(text_embeddings, dtype=float)
    query = np.asarray(query_embedding, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "text_embeddings must be a non-empty sequence of equal-length vectors"
        )

    dot_products = matrix @ query
    norm_products = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

    # Cosine similarity is undefined for a zero-length vector. Score such
    # rows as 0.0 rather than letting 0/0 produce NaN: np.argmax would pick
    # the NaN entry, so a degenerate embedding would win the search.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    best_index = int(np.argmax(similarities))
    return best_index, float(similarities[best_index])
+# Main Streamlit app function
 def main():
     st.header("LLM-powered PDF Chatbot 💬")
     if pdf is not None:
         pdf_reader = PdfReader(pdf)
         text = ""
         for page in pdf_reader.pages:
             text += page.extract_text()
+        # Split text into chunks
+        chunks = chunk_text(text)
+        # Generate embeddings for the chunks
         store_name = pdf.name[:-4]
         st.write(f'{store_name}')
         if os.path.exists(f"{store_name}.pkl"):
             with open(f"{store_name}.pkl", "rb") as f:
+                text_embeddings = pickle.load(f)
             st.write('Embeddings Loaded from the Disk')
         else:
+            text_embeddings = generate_embeddings(chunks)
             with open(f"{store_name}.pkl", "wb") as f:
+                pickle.dump(text_embeddings, f)
         # Accept user questions/query
         query = st.text_input("Ask questions about your PDF file:")
         if query:
+            # Generate embeddings for the query
+            query_embedding = generate_embeddings([query])[0]
+            # Find the best chunk for the query
+            best_index, similarity = find_best_chunk(query_embedding, text_embeddings)
+            best_chunk = chunks[best_index]
             # Use Hugging Face pipeline for question answering
+            qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+            result = qa_pipeline(question=query, context=best_chunk)
             st.write(result['answer'])
 if __name__ == '__main__':