yousifalishah committed on
Commit
03e75d1
·
verified ·
1 Parent(s): 65f1b04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -35
app.py CHANGED
@@ -1,13 +1,17 @@
 
1
  import logging
 
2
  import streamlit as st
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
 
5
  from langchain.vectorstores import FAISS
6
  from langchain.memory import ConversationBufferMemory
7
  from langchain.chains import ConversationalRetrievalChain
8
- from transformers import AutoTokenizer, AutoModel
9
- import torch
10
- import numpy as np
 
11
 
12
  # Set up logging
13
  logging.basicConfig(
@@ -15,13 +19,6 @@ logging.basicConfig(
15
  format='%(asctime)s - %(levelname)s - %(message)s'
16
  )
17
 
18
- # Load the Hugging Face model and tokenizer (local model from Hugging Face)
19
- def load_huggingface_model():
20
- model_name = "bert-base-uncased" # You can replace this with another model as needed
21
- tokenizer = AutoTokenizer.from_pretrained(model_name)
22
- model = AutoModel.from_pretrained(model_name)
23
- return tokenizer, model
24
-
25
  # Function to extract text from PDF files
26
  def get_pdf_text(pdf_docs):
27
  text = ""
@@ -42,36 +39,22 @@ def get_text_chunks(text):
42
  chunks = text_splitter.split_text(text)
43
  return chunks
44
 
45
- # Function to create embeddings using Hugging Face and return embeddings
46
- def create_embeddings(text_chunks, tokenizer, model):
47
- embeddings = []
48
- for chunk in text_chunks:
49
- inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
50
- with torch.no_grad():
51
- outputs = model(**inputs)
52
- embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
53
-
54
- # Convert the list of embeddings into a numpy array
55
- return np.array(embeddings)
56
-
57
  # Function to create a FAISS vectorstore
58
- def get_vectorstore(text_chunks, tokenizer, model):
59
- embeddings = create_embeddings(text_chunks, tokenizer, model)
60
- vectorstore = FAISS.from_embeddings(embeddings)
 
61
  return vectorstore
62
 
63
  # Function to set up the conversational retrieval chain
64
  def get_conversation_chain(vectorstore):
65
  try:
66
- llm = "Your LLM model here, e.g., Groq or another Hugging Face model"
67
- memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
68
-
69
  conversation_chain = ConversationalRetrievalChain.from_llm(
70
- llm=llm,
71
  retriever=vectorstore.as_retriever(),
72
- memory=memory
73
  )
74
-
75
  logging.info("Conversation chain created successfully.")
76
  return conversation_chain
77
  except Exception as e:
@@ -94,9 +77,7 @@ def handle_userinput(user_question):
94
 
95
  # Main function to run the Streamlit app
96
  def main():
97
- # Load Hugging Face model and tokenizer
98
- tokenizer, model = load_huggingface_model()
99
-
100
  st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
101
 
102
  if "conversation" not in st.session_state:
@@ -118,7 +99,7 @@ def main():
118
  with st.spinner("Processing..."):
119
  raw_text = get_pdf_text(pdf_docs)
120
  text_chunks = get_text_chunks(raw_text)
121
- vectorstore = get_vectorstore(text_chunks, tokenizer, model)
122
  st.session_state.conversation = get_conversation_chain(vectorstore)
123
 
124
  if __name__ == '__main__':
 
1
+ import os
2
  import logging
3
+ from dotenv import load_dotenv
4
  import streamlit as st
5
  from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
+ from sentence_transformers import SentenceTransformer
8
  from langchain.vectorstores import FAISS
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
+ from groq import Groq
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
 
16
  # Set up logging
17
  logging.basicConfig(
 
19
  format='%(asctime)s - %(levelname)s - %(message)s'
20
  )
21
 
 
 
 
 
 
 
 
22
  # Function to extract text from PDF files
23
  def get_pdf_text(pdf_docs):
24
  text = ""
 
39
  chunks = text_splitter.split_text(text)
40
  return chunks
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Function to create a FAISS vectorstore
43
+ def get_vectorstore(text_chunks):
44
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
45
+ embeddings = model.encode(text_chunks, convert_to_tensor=True)
46
+ vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
47
  return vectorstore
48
 
49
  # Function to set up the conversational retrieval chain
50
  def get_conversation_chain(vectorstore):
51
  try:
52
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 
53
  conversation_chain = ConversationalRetrievalChain.from_llm(
54
+ llm=client.chat.completions.create(model="llama-3.3-70b-versatile", temperature=0.5),
55
  retriever=vectorstore.as_retriever(),
56
+ memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True)
57
  )
 
58
  logging.info("Conversation chain created successfully.")
59
  return conversation_chain
60
  except Exception as e:
 
77
 
78
  # Main function to run the Streamlit app
79
  def main():
80
+ load_dotenv()
 
 
81
  st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
82
 
83
  if "conversation" not in st.session_state:
 
99
  with st.spinner("Processing..."):
100
  raw_text = get_pdf_text(pdf_docs)
101
  text_chunks = get_text_chunks(raw_text)
102
+ vectorstore = get_vectorstore(text_chunks)
103
  st.session_state.conversation = get_conversation_chain(vectorstore)
104
 
105
  if __name__ == '__main__':