meesamraza committed on
Commit
235deb2
·
verified ·
1 Parent(s): 4b84a53

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -87
app.py CHANGED
@@ -1,24 +1,15 @@
1
  import os
2
  import logging
3
- import io
4
- import pinecone
5
  from dotenv import load_dotenv
6
  import streamlit as st
7
- from PyPDF2 import PdfReader, errors
8
  from langchain.text_splitter import CharacterTextSplitter
9
- from langchain.embeddings.openai import OpenAIEmbeddings
10
  from langchain.vectorstores import Pinecone
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.chains import ConversationalRetrievalChain
13
- from langchain_groq import ChatGroq
14
-
15
- # Check and import Pinecone
16
- try:
17
- import pinecone
18
- print(f"Pinecone version: {pinecone.__version__}")
19
- except ModuleNotFoundError:
20
- st.error("Pinecone module not found. Please install it using 'pip install pinecone-client'.")
21
- raise
22
 
23
  # Load environment variables
24
  load_dotenv()
@@ -30,34 +21,25 @@ logging.basicConfig(
30
  )
31
 
32
  # Initialize Pinecone
33
- pinecone_api_key = os.getenv("PINECONE_API_KE")
34
- pinecone_env = os.getenv("PINECONE_API_KE")
35
-
36
  if pinecone_api_key and pinecone_env:
37
  pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
38
- logging.info("Pinecone initialized successfully.")
39
  else:
40
- st.error("Pinecone API key or environment missing.")
41
- raise ValueError("Missing Pinecone API key or environment variables.")
42
 
43
- # Function to extract text from PDFs with error handling
44
  def get_pdf_text(pdf_docs):
45
  text = ""
46
  for pdf in pdf_docs:
47
- try:
48
- pdf_file = io.BytesIO(pdf.read()) # Ensure proper file handling
49
- pdf_reader = PdfReader(pdf_file)
50
- for page in pdf_reader.pages:
51
- extracted_text = page.extract_text()
52
- if extracted_text:
53
- text += extracted_text + "\n"
54
- except errors.PdfReadError:
55
- st.error(f"Error reading {pdf.name}: The file may be corrupted or not a valid PDF.")
56
- except Exception as e:
57
- st.error(f"Unexpected error processing {pdf.name}: {e}")
58
  return text
59
 
60
- # Function to split text into chunks
61
  def get_text_chunks(text):
62
  text_splitter = CharacterTextSplitter(
63
  separator="\n",
@@ -69,39 +51,34 @@ def get_text_chunks(text):
69
 
70
  # Function to create a Pinecone vectorstore
71
  def get_vectorstore(text_chunks):
72
- try:
73
- embeddings = OpenAIEmbeddings()
74
- index_name = os.getenv("PINECONE_INDEX")
75
- if not index_name:
76
- st.error("Pinecone index name is missing. Please set PINECONE_INDEX in the environment.")
77
- return None
78
- # Check if the index exists
79
- if index_name not in pinecone.list_indexes():
80
- st.error(f"Pinecone index '{index_name}' not found. Please create it first.")
81
- return None
82
- vectorstore = Pinecone.from_texts(texts=text_chunks, embedding=embeddings, index_name=index_name)
83
- return vectorstore
84
- except Exception as e:
85
- st.error(f"Error creating Pinecone vectorstore: {e}")
86
- return None
87
 
88
  # Function to set up the conversational retrieval chain
89
  def get_conversation_chain(vectorstore):
90
  try:
91
  groq_api_key = os.getenv("GROQ_API_KEY")
92
- if not groq_api_key:
93
- st.error("Groq API key is missing. Please set GROQ_API_KEY in the environment.")
94
- return None
95
-
96
- llm = ChatGroq(
97
- model_name="mixtral-8x7b-32768",
98
- temperature=0.7,
99
- groq_api_key=groq_api_key
100
- )
 
 
 
 
101
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
102
 
103
  conversation_chain = ConversationalRetrievalChain.from_llm(
104
- llm=llm,
105
  retriever=vectorstore.as_retriever(),
106
  memory=memory
107
  )
@@ -111,59 +88,47 @@ def get_conversation_chain(vectorstore):
111
  except Exception as e:
112
  logging.error(f"Error creating conversation chain: {e}")
113
  st.error("An error occurred while setting up the conversation chain.")
114
- return None
115
 
116
  # Handle user input
117
  def handle_userinput(user_question):
118
  if st.session_state.conversation is not None:
119
  response = st.session_state.conversation({'question': user_question})
120
- if 'chat_history' in response:
121
- st.session_state.chat_history = response['chat_history']
122
- for i, message in enumerate(st.session_state.chat_history):
123
- if hasattr(message, "content"):
124
- role = "User" if i % 2 == 0 else "Bot"
125
- st.write(f"*{role}:* {message.content}")
126
- else:
127
- st.error("No valid response received.")
128
  else:
129
  st.warning("Please process the documents first.")
130
 
131
  # Main function to run the Streamlit app
132
  def main():
133
  load_dotenv()
134
- st.set_page_config(page_title="Chat with multiple PDFs", page_icon="📚")
135
 
136
  if "conversation" not in st.session_state:
137
  st.session_state.conversation = None
138
  if "chat_history" not in st.session_state:
139
  st.session_state.chat_history = None
140
 
141
- st.header("Chat with multiple PDFs 📚")
142
  user_question = st.text_input("Ask a question about your documents:")
143
  if user_question:
144
  handle_userinput(user_question)
145
 
146
  with st.sidebar:
147
  st.subheader("Your documents")
148
- pdf_docs = st.file_uploader(
149
- "Upload your PDFs here and click on 'Process'", accept_multiple_files=True, type=["pdf"]
150
- )
151
  if st.button("Process"):
152
- if pdf_docs:
153
- with st.spinner("Processing..."):
154
- raw_text = get_pdf_text(pdf_docs)
155
- if raw_text.strip():
156
- text_chunks = get_text_chunks(raw_text)
157
- vectorstore = get_vectorstore(text_chunks)
158
- if vectorstore:
159
- st.session_state.conversation = get_conversation_chain(vectorstore)
160
- st.success("Processing complete! You can now ask questions.")
161
- else:
162
- st.error("Vectorstore creation failed.")
163
- else:
164
- st.error("No valid text extracted from the PDFs.")
165
- else:
166
- st.warning("Please upload at least one PDF.")
167
 
168
  if __name__ == '__main__':
169
- main()
 
1
  import os
2
  import logging
 
 
3
  from dotenv import load_dotenv
4
  import streamlit as st
5
+ from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import Pinecone
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
+ from groq import Groq
12
+ import pinecone
 
 
 
 
 
 
 
13
 
14
  # Load environment variables
15
  load_dotenv()
 
21
  )
22
 
23
# Initialize Pinecone
# Credentials are read from the environment (populated by load_dotenv above).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")

if pinecone_api_key and pinecone_env:
    # Both settings present: connect the client for the rest of the app.
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
else:
    # Log and continue so the page still renders; later Pinecone calls
    # will fail if the configuration is genuinely missing.
    logging.error("Pinecone API key or environment is missing.")
 
30
 
31
# Function to extract text from PDF files
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of the uploaded PDFs.

    Args:
        pdf_docs: iterable of uploaded file-like objects (Streamlit UploadedFile).

    Returns:
        str: all extracted page text, newline-separated. Files that cannot
        be parsed are reported via st.error and skipped instead of aborting
        the whole batch.
    """
    text = ""
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
        except Exception as e:  # corrupt upload or not a valid PDF
            st.error(f"Could not read {getattr(pdf, 'name', 'file')}: {e}")
            continue
        for page in pdf_reader.pages:
            extracted_text = page.extract_text()
            if extracted_text:  # extract_text() may return None for image-only pages
                text += extracted_text + "\n"
    return text
41
 
42
+ # Function to split the extracted text into chunks
43
  def get_text_chunks(text):
44
  text_splitter = CharacterTextSplitter(
45
  separator="\n",
 
51
 
52
# Function to create a Pinecone vectorstore
def get_vectorstore(text_chunks):
    """Embed the given text chunks and index them in Pinecone.

    Args:
        text_chunks: list of text strings to embed and store.

    Returns:
        A LangChain Pinecone vectorstore backed by the configured index.

    Raises:
        ValueError: if the PINECONE_INDEX environment variable is unset.
    """
    # Local sentence-transformers model; no API key required for embeddings.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    index_name = os.getenv("PINECONE_INDEX")
    if not index_name:
        raise ValueError("Pinecone index name is not set in environment variables.")
    return Pinecone.from_texts(texts=text_chunks, embedding=embeddings, index_name=index_name)
 
 
 
 
 
 
 
 
 
60
 
61
  # Function to set up the conversational retrieval chain
62
  def get_conversation_chain(vectorstore):
63
  try:
64
  groq_api_key = os.getenv("GROQ_API_KEY")
65
+ client = Groq(api_key=groq_api_key)
66
+
67
+ def groq_llm(messages):
68
+ completion = client.chat.completions.create(
69
+ model="llama-3.3-70b-versatile",
70
+ messages=messages,
71
+ temperature=0.7,
72
+ max_completion_tokens=1024,
73
+ top_p=1,
74
+ stream=False
75
+ )
76
+ return completion.choices[0].message.content
77
+
78
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
79
 
80
  conversation_chain = ConversationalRetrievalChain.from_llm(
81
+ llm=groq_llm,
82
  retriever=vectorstore.as_retriever(),
83
  memory=memory
84
  )
 
88
  except Exception as e:
89
  logging.error(f"Error creating conversation chain: {e}")
90
  st.error("An error occurred while setting up the conversation chain.")
 
91
 
92
# Handle user input
def handle_userinput(user_question):
    """Send the user's question to the conversation chain and render the chat.

    Args:
        user_question: the question string typed into the text input.

    Side effects:
        Updates st.session_state.chat_history and writes the transcript
        (or a warning when no documents have been processed yet) to the page.
    """
    if st.session_state.conversation is not None:
        response = st.session_state.conversation({'question': user_question})
        # ConversationBufferMemory(return_messages=True) exposes the transcript
        # under 'chat_history'; use a safe lookup so a chain that returns no
        # history does not raise KeyError and blank the page.
        st.session_state.chat_history = response.get('chat_history') or []

        for i, message in enumerate(st.session_state.chat_history):
            # Memory alternates human/AI turns, starting with the user.
            role = "User" if i % 2 == 0 else "Bot"
            st.write(f"*{role}:* {message.content}")
    else:
        st.warning("Please process the documents first.")
103
 
104
# Main function to run the Streamlit app
def main():
    """Entry point: configure the page, initialise session state, run the UI."""
    load_dotenv()
    st.set_page_config(page_title="Chat with PDFs", page_icon=":books:")

    # Session state survives Streamlit reruns; initialise on first load only.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload PDFs and click 'Process'", accept_multiple_files=True, type=["pdf"])
        if st.button("Process"):
            if not pdf_docs:
                # file_uploader yields None/[] until something is uploaded;
                # without this guard the pipeline runs on nothing and shows
                # a misleading extraction error.
                st.warning("Please upload at least one PDF.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    if raw_text.strip():
                        text_chunks = get_text_chunks(raw_text)
                        vectorstore = get_vectorstore(text_chunks)
                        st.session_state.conversation = get_conversation_chain(vectorstore)
                        st.success("Processing complete! You can now ask questions.")
                    else:
                        st.error("No valid text extracted from the PDFs.")
 
 
 
 
 
 
132
 
133
  if __name__ == '__main__':
134
+ main()