Spaces:

yousifalishah
/

chatWithMultiplePDF1

Sleeping

App Files Community

yousifalishah committed on Feb 26, 2025

Commit

03e75d1

verified ·

1 Parent(s): 65f1b04

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -35

app.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import logging
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-from transformers import AutoTokenizer, AutoModel
-import torch
-import numpy as np
 # Set up logging
 logging.basicConfig(
@@ -15,13 +19,6 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
-# Load the Hugging Face model and tokenizer (local model from Hugging Face)
-def load_huggingface_model():
-    model_name = "bert-base-uncased"  # You can replace this with another model as needed
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
-    return tokenizer, model
 # Function to extract text from PDF files
 def get_pdf_text(pdf_docs):
     text = ""
@@ -42,36 +39,22 @@ def get_text_chunks(text):
     chunks = text_splitter.split_text(text)
     return chunks
-# Function to create embeddings using Hugging Face and return embeddings
-def create_embeddings(text_chunks, tokenizer, model):
-    embeddings = []
-    for chunk in text_chunks:
-        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
-        with torch.no_grad():
-            outputs = model(**inputs)
-            embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
-    # Convert the list of embeddings into a numpy array
-    return np.array(embeddings)
 # Function to create a FAISS vectorstore
-def get_vectorstore(text_chunks, tokenizer, model):
-    embeddings = create_embeddings(text_chunks, tokenizer, model)
-    vectorstore = FAISS.from_embeddings(embeddings)
     return vectorstore
 # Function to set up the conversational retrieval chain
 def get_conversation_chain(vectorstore):
     try:
-        llm = "Your LLM model here, e.g., Groq or another Hugging Face model"
-        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
         conversation_chain = ConversationalRetrievalChain.from_llm(
-            llm=llm,
             retriever=vectorstore.as_retriever(),
-            memory=memory
         )
         logging.info("Conversation chain created successfully.")
         return conversation_chain
     except Exception as e:
@@ -94,9 +77,7 @@ def handle_userinput(user_question):
 # Main function to run the Streamlit app
 def main():
-    # Load Hugging Face model and tokenizer
-    tokenizer, model = load_huggingface_model()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     if "conversation" not in st.session_state:
@@ -118,7 +99,7 @@ def main():
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)
                 text_chunks = get_text_chunks(raw_text)
-                vectorstore = get_vectorstore(text_chunks, tokenizer, model)
                 st.session_state.conversation = get_conversation_chain(vectorstore)
 if __name__ == '__main__':

+import os
 import logging
+from dotenv import load_dotenv
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
+from sentence_transformers import SentenceTransformer
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
+from groq import Groq
+# Load environment variables
+load_dotenv()
 # Set up logging
 logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 # Function to extract text from PDF files
 def get_pdf_text(pdf_docs):
     text = ""
     chunks = text_splitter.split_text(text)
     return chunks
 # Function to create a FAISS vectorstore
+def get_vectorstore(text_chunks):
+    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    embeddings = model.encode(text_chunks, convert_to_tensor=True)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 # Function to set up the conversational retrieval chain
 def get_conversation_chain(vectorstore):
     try:
+        client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         conversation_chain = ConversationalRetrievalChain.from_llm(
+            llm=client.chat.completions.create(model="llama-3.3-70b-versatile", temperature=0.5),
             retriever=vectorstore.as_retriever(),
+            memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True)
         )
         logging.info("Conversation chain created successfully.")
         return conversation_chain
     except Exception as e:
 # Main function to run the Streamlit app
 def main():
+    load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     if "conversation" not in st.session_state:
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)
                 text_chunks = get_text_chunks(raw_text)
+                vectorstore = get_vectorstore(text_chunks)
                 st.session_state.conversation = get_conversation_chain(vectorstore)
 if __name__ == '__main__':