1MR committed on
Commit
22a9f10
·
verified ·
1 Parent(s): b882d51

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -209
app.py CHANGED
@@ -11,6 +11,7 @@ import os
11
  from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
12
  from htmlTemplates import css, bot_template, user_template
13
 
 
14
  def get_pdf_text(pdf_docs):
15
  temp_dir = tempfile.TemporaryDirectory()
16
  temp_filepath = os.path.join(temp_dir.name, pdf_docs.name)
@@ -20,6 +21,7 @@ def get_pdf_text(pdf_docs):
20
  pdf_doc = pdf_loader.load()
21
  return pdf_doc
22
 
 
23
  def get_text_file(text_docs):
24
  temp_dir = tempfile.TemporaryDirectory()
25
  temp_filepath = os.path.join(temp_dir.name, text_docs.name)
@@ -29,6 +31,7 @@ def get_text_file(text_docs):
29
  text_doc = text_loader.load()
30
  return text_doc
31
 
 
32
  def get_csv_file(csv_docs):
33
  temp_dir = tempfile.TemporaryDirectory()
34
  temp_filepath = os.path.join(temp_dir.name, csv_docs.name)
@@ -38,6 +41,7 @@ def get_csv_file(csv_docs):
38
  csv_doc = csv_loader.load()
39
  return csv_doc
40
 
 
41
  def get_json_file(json_docs):
42
  temp_dir = tempfile.TemporaryDirectory()
43
  temp_filepath = os.path.join(temp_dir.name, json_docs.name)
@@ -47,20 +51,26 @@ def get_json_file(json_docs):
47
  json_doc = json_loader.load()
48
  return json_doc
49
 
 
50
  def get_text_chunks(documents):
51
  text_splitter = RecursiveCharacterTextSplitter(
52
  chunk_size=300,
53
  chunk_overlap=100,
54
  length_function=len
55
  )
 
56
  documents = text_splitter.split_documents(documents)
57
  return documents
58
 
 
59
  def get_vectorstore(text_chunks):
60
  embeddings = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")
61
  vectorstore = FAISS.from_documents(text_chunks, embeddings)
62
  return vectorstore
63
-
 
 
 
64
  def get_conversation_chain(vectorstore, tokenH):
65
  if not tokenH:
66
  raise ValueError("API token is required to initialize the HuggingFaceHub model")
@@ -81,6 +91,8 @@ def get_conversation_chain(vectorstore, tokenH):
81
  except Exception as e:
82
  raise ValueError(f"Error generating response: {str(e)}")
83
 
 
 
84
  def conversation_chain(user_input):
85
  retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
86
  documents = retriever.get_relevant_documents(user_input)
@@ -90,23 +102,42 @@ def get_conversation_chain(vectorstore, tokenH):
90
 
91
  return conversation_chain
92
 
 
93
  def handle_userinput(user_question):
 
94
  if "chat_history" not in st.session_state:
95
  st.session_state.chat_history = []
96
 
 
97
  response = st.session_state.conversation(user_question)
 
 
98
  st.session_state.chat_history.append({"role": "user", "content": user_question})
99
  st.session_state.chat_history.append({"role": "assistant", "content": response})
100
 
101
- def display_chat_history():
102
- if "chat_history" in st.session_state and st.session_state.chat_history:
103
- for message in st.session_state.chat_history:
104
- if message["role"] == "user":
105
- st.write(f"User: {message['content']}")
106
- else:
107
- st.write(f"Bot: {message['content']}")
108
- else:
109
- st.write("No chat history to display.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def main():
112
  st.set_page_config(page_title="Chat with multiple Files", page_icon=":books:")
@@ -117,11 +148,13 @@ def main():
117
  st.warning("Please enter a valid HuggingFace API token.")
118
  return
119
 
 
120
  if "conversation" not in st.session_state:
121
  st.session_state.conversation = None
122
  if "chat_history" not in st.session_state:
123
  st.session_state.chat_history = []
124
 
 
125
  user_question = st.text_input("Ask a question about your documents:")
126
  if user_question:
127
  if st.session_state.conversation:
@@ -129,9 +162,7 @@ def main():
129
  else:
130
  st.warning("Please upload and process files first!")
131
 
132
- if st.button("Display Chat History"):
133
- display_chat_history()
134
-
135
  docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
136
  if st.button("Process"):
137
  with st.spinner("Processing"):
@@ -147,209 +178,19 @@ def main():
147
  elif file.type == 'application/json':
148
  doc_list.extend(get_json_file(file))
149
 
 
150
  text_chunks = get_text_chunks(doc_list)
 
 
151
  vectorstore = get_vectorstore(text_chunks)
 
 
152
  st.session_state.conversation = get_conversation_chain(vectorstore, tokenH)
153
  st.success("Documents processed successfully!")
154
  else:
155
  st.warning("Please upload at least one document to process.")
156
 
 
157
  if __name__ == '__main__':
158
  main()
159
 
160
- # import streamlit as st
161
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
162
- # from langchain_community.embeddings import HuggingFaceEmbeddings
163
- # from langchain_community.vectorstores import FAISS
164
- # from langchain.chat_models import ChatOpenAI
165
- # from langchain.memory import ConversationBufferMemory
166
- # from langchain.chains import ConversationalRetrievalChain
167
- # from huggingface_hub import InferenceClient
168
- # import tempfile
169
- # import os
170
- # from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
171
- # from htmlTemplates import css, bot_template, user_template
172
-
173
-
174
- # def get_pdf_text(pdf_docs):
175
- # temp_dir = tempfile.TemporaryDirectory()
176
- # temp_filepath = os.path.join(temp_dir.name, pdf_docs.name)
177
- # with open(temp_filepath, "wb") as f:
178
- # f.write(pdf_docs.getvalue())
179
- # pdf_loader = PyPDFLoader(temp_filepath)
180
- # pdf_doc = pdf_loader.load()
181
- # return pdf_doc
182
-
183
-
184
- # def get_text_file(text_docs):
185
- # temp_dir = tempfile.TemporaryDirectory()
186
- # temp_filepath = os.path.join(temp_dir.name, text_docs.name)
187
- # with open(temp_filepath, "wb") as f:
188
- # f.write(text_docs.getvalue())
189
- # text_loader = TextLoader(temp_filepath)
190
- # text_doc = text_loader.load()
191
- # return text_doc
192
-
193
-
194
- # def get_csv_file(csv_docs):
195
- # temp_dir = tempfile.TemporaryDirectory()
196
- # temp_filepath = os.path.join(temp_dir.name, csv_docs.name)
197
- # with open(temp_filepath, "wb") as f:
198
- # f.write(csv_docs.getvalue())
199
- # csv_loader = CSVLoader(temp_filepath)
200
- # csv_doc = csv_loader.load()
201
- # return csv_doc
202
-
203
-
204
- # def get_json_file(json_docs):
205
- # temp_dir = tempfile.TemporaryDirectory()
206
- # temp_filepath = os.path.join(temp_dir.name, json_docs.name)
207
- # with open(temp_filepath, "wb") as f:
208
- # f.write(json_docs.getvalue())
209
- # json_loader = JSONLoader(temp_filepath)
210
- # json_doc = json_loader.load()
211
- # return json_doc
212
-
213
-
214
- # def get_text_chunks(documents):
215
- # text_splitter = RecursiveCharacterTextSplitter(
216
- # chunk_size=300,
217
- # chunk_overlap=100,
218
- # length_function=len
219
- # )
220
-
221
- # documents = text_splitter.split_documents(documents)
222
- # return documents
223
-
224
-
225
- # def get_vectorstore(text_chunks):
226
- # embeddings = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")
227
- # vectorstore = FAISS.from_documents(text_chunks, embeddings)
228
- # return vectorstore
229
- # #sentence-transformers/all-MiniLM-L6-v2
230
- # #HuggingFaceH4/zephyr-7b-alpha
231
- # #Qwen/Qwen2.5-72B-Instruct
232
- # #mistralai/Mistral-7B-Instruct-v0.2
233
- # def get_conversation_chain(vectorstore, tokenH):
234
- # if not tokenH:
235
- # raise ValueError("API token is required to initialize the HuggingFaceHub model")
236
-
237
- # try:
238
- # client = InferenceClient(api_key=tokenH)
239
- # except Exception as e:
240
- # raise ValueError(f"Error initializing HuggingFace InferenceClient: {str(e)}")
241
-
242
- # def generate_response(messages):
243
- # try:
244
- # completion = client.chat.completions.create(
245
- # model="Qwen/Qwen2.5-72B-Instruct",
246
- # messages=messages,
247
- # max_tokens=500
248
- # )
249
- # return completion.choices[0].message['content']
250
- # except Exception as e:
251
- # raise ValueError(f"Error generating response: {str(e)}")
252
-
253
- # # messages = [{"role": "user", "content": user_input}, {"role": "system", "content": documents_text}]
254
-
255
- # def conversation_chain(user_input):
256
- # retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
257
- # documents = retriever.get_relevant_documents(user_input)
258
- # documents_text = "\n".join(doc.page_content for doc in documents)
259
- # messages = [{"role": "user", "content": user_input}, {"role": "system", "content": documents_text}]
260
- # return generate_response(messages)
261
-
262
- # return conversation_chain
263
-
264
-
265
- # def handle_userinput(user_question):
266
- # # Ensure chat_history is initialized
267
- # if "chat_history" not in st.session_state:
268
- # st.session_state.chat_history = []
269
-
270
- # # Get the response from the conversation
271
- # response = st.session_state.conversation(user_question)
272
-
273
- # # Append the user's question and the assistant's response to chat history
274
- # st.session_state.chat_history.append({"role": "user", "content": user_question})
275
- # st.session_state.chat_history.append({"role": "assistant", "content": response})
276
-
277
- # # Display the chat history
278
- # for message in st.session_state.chat_history:
279
- # if message["role"] == "user":
280
- # st.write(f"<div style='color: gray;'>User: {message['content']}</div>", unsafe_allow_html=True)
281
- # else:
282
- # st.write(f"<div style='color: black;'>Bot: {message['content']}</div>", unsafe_allow_html=True)
283
-
284
- # # for i, message in enumerate(st.session_state.chat_history):
285
- # # if i % 2 == 0:
286
- # # # Display user messages
287
- # # st.write(user_template.replace("{{MSG}}", message["content"]), unsafe_allow_html=True)
288
- # # else:
289
- # # # Display assistant messages
290
- # # st.write(bot_template.replace("{{MSG}}", message["content"]), unsafe_allow_html=True)
291
-
292
- # # for i, message in enumerate(st.session_state.chat_history):
293
- # # if i % 2 == 0:
294
- # # st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
295
- # # # st.write(f"<div style='color: gray;'>User: {message['content']}</div>", unsafe_allow_html=True)
296
- # # else:
297
- # # st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
298
- # # # st.write(f"<div style='color: black;'>Bot: {message['content']}</div>", unsafe_allow_html=True)
299
-
300
-
301
- # def main():
302
- # st.set_page_config(page_title="Chat with multiple Files", page_icon=":books:")
303
- # st.header("Chat with Multiple Files")
304
- # tokenH = st.text_input("Paste your HuggingFace API Token (sk-...)")
305
-
306
- # if not tokenH:
307
- # st.warning("Please enter a valid HuggingFace API token.")
308
- # return
309
-
310
- # # Initialize session state variables
311
- # if "conversation" not in st.session_state:
312
- # st.session_state.conversation = None
313
- # if "chat_history" not in st.session_state:
314
- # st.session_state.chat_history = []
315
-
316
- # # User input for questions
317
- # user_question = st.text_input("Ask a question about your documents:")
318
- # if user_question:
319
- # if st.session_state.conversation:
320
- # handle_userinput(user_question)
321
- # else:
322
- # st.warning("Please upload and process files first!")
323
-
324
- # # File uploader and processing
325
- # docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
326
- # if st.button("Process"):
327
- # with st.spinner("Processing"):
328
- # if docs:
329
- # doc_list = []
330
- # for file in docs:
331
- # if file.type == 'text/plain':
332
- # doc_list.extend(get_text_file(file))
333
- # elif file.type in ['application/octet-stream', 'application/pdf']:
334
- # doc_list.extend(get_pdf_text(file))
335
- # elif file.type == 'text/csv':
336
- # doc_list.extend(get_csv_file(file))
337
- # elif file.type == 'application/json':
338
- # doc_list.extend(get_json_file(file))
339
-
340
- # # Generate text chunks
341
- # text_chunks = get_text_chunks(doc_list)
342
-
343
- # # Create vector store
344
- # vectorstore = get_vectorstore(text_chunks)
345
-
346
- # # Initialize conversation chain
347
- # st.session_state.conversation = get_conversation_chain(vectorstore, tokenH)
348
- # st.success("Documents processed successfully!")
349
- # else:
350
- # st.warning("Please upload at least one document to process.")
351
-
352
-
353
- # if __name__ == '__main__':
354
- # main()
355
-
 
11
  from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
12
  from htmlTemplates import css, bot_template, user_template
13
 
14
+
15
  def get_pdf_text(pdf_docs):
16
  temp_dir = tempfile.TemporaryDirectory()
17
  temp_filepath = os.path.join(temp_dir.name, pdf_docs.name)
 
21
  pdf_doc = pdf_loader.load()
22
  return pdf_doc
23
 
24
+
25
  def get_text_file(text_docs):
26
  temp_dir = tempfile.TemporaryDirectory()
27
  temp_filepath = os.path.join(temp_dir.name, text_docs.name)
 
31
  text_doc = text_loader.load()
32
  return text_doc
33
 
34
+
35
  def get_csv_file(csv_docs):
36
  temp_dir = tempfile.TemporaryDirectory()
37
  temp_filepath = os.path.join(temp_dir.name, csv_docs.name)
 
41
  csv_doc = csv_loader.load()
42
  return csv_doc
43
 
44
+
45
  def get_json_file(json_docs):
46
  temp_dir = tempfile.TemporaryDirectory()
47
  temp_filepath = os.path.join(temp_dir.name, json_docs.name)
 
51
  json_doc = json_loader.load()
52
  return json_doc
53
 
54
+
55
  def get_text_chunks(documents):
56
  text_splitter = RecursiveCharacterTextSplitter(
57
  chunk_size=300,
58
  chunk_overlap=100,
59
  length_function=len
60
  )
61
+
62
  documents = text_splitter.split_documents(documents)
63
  return documents
64
 
65
+
66
  def get_vectorstore(text_chunks):
67
  embeddings = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")
68
  vectorstore = FAISS.from_documents(text_chunks, embeddings)
69
  return vectorstore
70
+ #sentence-transformers/all-MiniLM-L6-v2
71
+ #HuggingFaceH4/zephyr-7b-alpha
72
+ #Qwen/Qwen2.5-72B-Instruct
73
+ #mistralai/Mistral-7B-Instruct-v0.2
74
  def get_conversation_chain(vectorstore, tokenH):
75
  if not tokenH:
76
  raise ValueError("API token is required to initialize the HuggingFaceHub model")
 
91
  except Exception as e:
92
  raise ValueError(f"Error generating response: {str(e)}")
93
 
94
+ # messages = [{"role": "user", "content": user_input}, {"role": "system", "content": documents_text}]
95
+
96
  def conversation_chain(user_input):
97
  retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
98
  documents = retriever.get_relevant_documents(user_input)
 
102
 
103
  return conversation_chain
104
 
105
+
106
  def handle_userinput(user_question):
107
+ # Ensure chat_history is initialized
108
  if "chat_history" not in st.session_state:
109
  st.session_state.chat_history = []
110
 
111
+ # Get the response from the conversation
112
  response = st.session_state.conversation(user_question)
113
+
114
+ # Append the user's question and the assistant's response to chat history
115
  st.session_state.chat_history.append({"role": "user", "content": user_question})
116
  st.session_state.chat_history.append({"role": "assistant", "content": response})
117
 
118
+ # Display the chat history
119
+ for message in st.session_state.chat_history:
120
+ if message["role"] == "user":
121
+ st.write(f"<div style='color: gray;'>User: {message['content']}</div>", unsafe_allow_html=True)
122
+ else:
123
+ st.write(f"<div style='color: black;'>Bot: {message['content']}</div>", unsafe_allow_html=True)
124
+
125
+ # for i, message in enumerate(st.session_state.chat_history):
126
+ # if i % 2 == 0:
127
+ # # Display user messages
128
+ # st.write(user_template.replace("{{MSG}}", message["content"]), unsafe_allow_html=True)
129
+ # else:
130
+ # # Display assistant messages
131
+ # st.write(bot_template.replace("{{MSG}}", message["content"]), unsafe_allow_html=True)
132
+
133
+ # for i, message in enumerate(st.session_state.chat_history):
134
+ # if i % 2 == 0:
135
+ # st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
136
+ # # st.write(f"<div style='color: gray;'>User: {message['content']}</div>", unsafe_allow_html=True)
137
+ # else:
138
+ # st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
139
+ # # st.write(f"<div style='color: black;'>Bot: {message['content']}</div>", unsafe_allow_html=True)
140
+
141
 
142
  def main():
143
  st.set_page_config(page_title="Chat with multiple Files", page_icon=":books:")
 
148
  st.warning("Please enter a valid HuggingFace API token.")
149
  return
150
 
151
+ # Initialize session state variables
152
  if "conversation" not in st.session_state:
153
  st.session_state.conversation = None
154
  if "chat_history" not in st.session_state:
155
  st.session_state.chat_history = []
156
 
157
+ # User input for questions
158
  user_question = st.text_input("Ask a question about your documents:")
159
  if user_question:
160
  if st.session_state.conversation:
 
162
  else:
163
  st.warning("Please upload and process files first!")
164
 
165
+ # File uploader and processing
 
 
166
  docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
167
  if st.button("Process"):
168
  with st.spinner("Processing"):
 
178
  elif file.type == 'application/json':
179
  doc_list.extend(get_json_file(file))
180
 
181
+ # Generate text chunks
182
  text_chunks = get_text_chunks(doc_list)
183
+
184
+ # Create vector store
185
  vectorstore = get_vectorstore(text_chunks)
186
+
187
+ # Initialize conversation chain
188
  st.session_state.conversation = get_conversation_chain(vectorstore, tokenH)
189
  st.success("Documents processed successfully!")
190
  else:
191
  st.warning("Please upload at least one document to process.")
192
 
193
+
194
  if __name__ == '__main__':
195
  main()
196