Krish30 committed on
Commit
f1fff07
·
verified ·
1 Parent(s): 94e69f3

Upload 5 files

Browse files
Files changed (5) hide show
  1. app_notes_ai.py +530 -0
  2. chat_history.db +0 -0
  3. config.json +1 -0
  4. requirements.txt +12 -0
  5. vectorize_documents.py +86 -0
app_notes_ai.py ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import sqlite3
4
+ from datetime import datetime
5
+ import streamlit as st
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_chroma import Chroma
8
+ from langchain_groq import ChatGroq
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+
12
+ from vectorize_documents import embeddings
13
+
14
# Directory containing this script; config.json is looked up next to it so the
# app does not depend on the process's current working directory.
working_dir = os.path.dirname(os.path.abspath(__file__))

# Load the optional JSON config. The file is opened with a context manager so
# the handle is closed deterministically instead of being left to the GC.
# NOTE(review): API keys should not live in a committed config.json; prefer
# supplying GROQ_API_KEY through the environment / deployment secrets.
try:
    with open(os.path.join(working_dir, "config.json"), encoding="utf-8") as config_file:
        config_data = json.load(config_file)
except FileNotFoundError:
    config_data = {}

# The value from config.json wins when present (original behaviour); otherwise
# fall back to a key that is already exported in the environment.
GROQ_API_KEY = config_data.get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not configured: set the GROQ_API_KEY environment "
        "variable or add it to config.json next to this script."
    )
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
18
+
19
+ # Set up the database with check_same_thread=False
20
def setup_db(db_path="chat_history.db"):
    """Open the chat-history SQLite database and make sure its table exists.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``"chat_history.db"`` (resolved against the current working
            directory), which is what the app has always used.

    Returns:
        An open ``sqlite3.Connection``. The caller owns it and is responsible
        for closing it.
    """
    # check_same_thread=False only disables sqlite3's "used from another
    # thread" guard; it does NOT make the connection thread-safe. Callers that
    # share the connection across threads must serialise access themselves.
    conn = sqlite3.connect(db_path, check_same_thread=False)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS chat_histories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT,
            timestamp TEXT,
            day TEXT,
            user_message TEXT,
            assistant_response TEXT
        )
    """)
    conn.commit()
    return conn
35
+
36
+ # Function to save chat history to SQLite
37
def save_chat_history(conn, username, timestamp, day, user_message, assistant_response):
    """Persist one question/answer exchange to the ``chat_histories`` table.

    Args:
        conn: Open ``sqlite3.Connection`` whose database contains the
            ``chat_histories`` table.
        username: Name of the user who asked the question.
        timestamp: Time of the exchange, stored as text.
        day: Weekday name of the exchange, stored as text.
        user_message: The text the user submitted.
        assistant_response: The text the assistant replied with.

    Raises:
        sqlite3.Error: If the insert fails; the transaction is rolled back
            before the exception propagates.
    """
    # Using the connection as a context manager commits on success and rolls
    # back on failure, so a failed insert never leaves a transaction open.
    with conn:
        conn.execute(
            """
            INSERT INTO chat_histories (username, timestamp, day, user_message, assistant_response)
            VALUES (?, ?, ?, ?, ?)
            """,
            (username, timestamp, day, user_message, assistant_response),
        )
44
+
45
+ # Function to set up vectorstore for embeddings
46
def setup_vectorstore(persist_directory="vector_db_2R"):
    """Open the persisted Chroma vector store used for retrieval.

    Args:
        persist_directory: Directory holding the Chroma collection. Defaults
            to ``"vector_db_2R"``, the directory the vectorization script
            writes to.

    Returns:
        A ``Chroma`` instance backed by ``persist_directory`` and using a
        default-configured ``HuggingFaceEmbeddings`` model for queries.
    """
    # Named `embedding_model` so it does not shadow the module-level
    # `embeddings` imported from vectorize_documents.
    embedding_model = HuggingFaceEmbeddings()
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model,
    )
50
+
51
+ # Function to set up the chatbot chain
52
def chat_chain(vectorstore, model="llama-3.1-70b-versatile", temperature=0):
    """Build the conversational retrieval chain that answers user questions.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``; supplies the
            documents the LLM is grounded on.
        model: Groq model identifier passed to ``ChatGroq``. The default is
            the model the app has always used.
            NOTE(review): confirm this model id is still served by Groq.
        temperature: Sampling temperature passed to ``ChatGroq``.

    Returns:
        A ``ConversationalRetrievalChain`` whose result contains the reply
        under ``"answer"`` and the retrieved documents (source documents are
        requested via ``return_source_documents=True``).
    """
    llm = ChatGroq(model=model, temperature=temperature)

    # ConversationBufferMemory keeps the raw message list and takes no `llm`
    # setting, so the `llm=` argument the original passed is omitted.
    # output_key must be "answer": the chain also returns source documents,
    # and the memory needs to know which output to record.
    memory = ConversationBufferMemory(
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        memory=memory,
        verbose=True,
        return_source_documents=True,
    )
70
+
71
+ # Streamlit UI setup
72
# ---------------------------------------------------------------------------
# Streamlit page. Streamlit re-executes this script from the top on every
# interaction, so anything that must survive a rerun lives in st.session_state.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Notes.AI", page_icon="🤖AI", layout="centered")

st.title("🤖 Notes.AI")
st.subheader("Hey! Here you can search for notes of CSE 7th Sem! Read Notes, Read PYQ answers also!!")

# Step 1: one database connection per browser session.
if "conn" not in st.session_state:
    st.session_state.conn = setup_db()

# Step 2: build the retrieval components once per session. This is the
# expensive part (embedding model + vector store), so it is never repeated
# once the chain exists.
if "conversational_chain" not in st.session_state:
    with st.spinner("Loading chatbot interface... Please wait."):
        st.session_state.vectorstore = setup_vectorstore()
        st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)

# Step 3: "log in" by name. Until a non-blank name is entered, the chat UI
# below stays hidden.
if "username" not in st.session_state:
    username = st.text_input("Enter your name to proceed:").strip()
    if username:
        st.session_state.username = username
        st.session_state.chat_history = []  # In-memory transcript for this session
        st.success(f"Welcome, {username}! The chatbot interface is ready.")
else:
    username = st.session_state.username

# Step 4: chat interface.
if "username" in st.session_state:
    st.subheader(f"Hello {username}, start your query below!")

    # Replay the transcript so earlier turns stay visible after a rerun.
    # Only "user" and "assistant" roles are ever appended below.
    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    user_input = st.chat_input("Ask AI....")

    if user_input:
        with st.spinner("Processing your query... Please wait."):
            st.session_state.chat_history.append({"role": "user", "content": user_input})
            with st.chat_message("user"):
                st.markdown(user_input)

            with st.chat_message("assistant"):
                # `.invoke` replaces calling the chain object directly, which
                # LangChain has deprecated.
                response = st.session_state.conversational_chain.invoke({"question": user_input})
                assistant_response = response["answer"]
                st.markdown(assistant_response)

            st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})

            # Sample the clock once so the stored timestamp and weekday can
            # never disagree (e.g. when the exchange straddles midnight).
            now = datetime.now()
            save_chat_history(
                st.session_state.conn,
                username,
                now.strftime("%Y-%m-%d %H:%M:%S"),
                now.strftime("%A"),  # Full weekday name, e.g. "Monday"
                user_input,
                assistant_response,
            )
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+ # # Set up the database with check_same_thread=False
157
+ # def setup_db():
158
+ # conn = sqlite3.connect("chat_history.db", check_same_thread=False) # Ensure thread-safe connection
159
+ # cursor = conn.cursor()
160
+ # cursor.execute("""
161
+ # CREATE TABLE IF NOT EXISTS chat_histories (
162
+ # id INTEGER PRIMARY KEY AUTOINCREMENT,
163
+ # username TEXT,
164
+ # timestamp TEXT,
165
+ # day TEXT,
166
+ # user_message TEXT,
167
+ # assistant_response TEXT
168
+ # )
169
+ # """)
170
+ # conn.commit()
171
+ # return conn # Return the connection
172
+
173
+ # # Function to save chat history to SQLite
174
+ # def save_chat_history(conn, username, timestamp, day, user_message, assistant_response):
175
+ # cursor = conn.cursor()
176
+ # cursor.execute("""
177
+ # INSERT INTO chat_histories (username, timestamp, day, user_message, assistant_response)
178
+ # VALUES (?, ?, ?, ?, ?)
179
+ # """, (username, timestamp, day, user_message, assistant_response))
180
+ # conn.commit()
181
+
182
+ # # Function to load chat history from SQLite
183
+ # def load_chat_history(conn, username):
184
+ # cursor = conn.cursor()
185
+ # cursor.execute("""
186
+ # SELECT timestamp, day, user_message, assistant_response
187
+ # FROM chat_histories
188
+ # WHERE username = ?
189
+ # ORDER BY timestamp
190
+ # """, (username,))
191
+ # chat_history = cursor.fetchall()
192
+ # return chat_history
193
+
194
+ # # Function to set up vectorstore for embeddings
195
+ # def setup_vectorstore():
196
+ # embeddings = HuggingFaceEmbeddings()
197
+ # vectorstore = Chroma(persist_directory="vector_db_dir_notes_ai", embedding_function=embeddings)
198
+ # return vectorstore
199
+
200
+ # # Function to set up the chatbot chain
201
+ # def chat_chain(vectorstore):
202
+ # llm = ChatGroq(
203
+ # model="llama-3.1-70b-versatile",
204
+ # temperature=0
205
+ # )
206
+ # retriever = vectorstore.as_retriever()
207
+ # memory = ConversationBufferMemory(
208
+ # llm=llm,
209
+ # output_key="answer",
210
+ # memory_key="chat_history",
211
+ # return_messages=True
212
+ # )
213
+ # chain = ConversationalRetrievalChain.from_llm(
214
+ # llm=llm,
215
+ # retriever=retriever,
216
+ # chain_type="stuff",
217
+ # memory=memory,
218
+ # verbose=True,
219
+ # return_source_documents=True
220
+ # )
221
+ # return chain
222
+
223
+ # # Streamlit UI setup
224
+ # st.set_page_config(
225
+ # page_title="Notes.AI",
226
+ # page_icon="🤖AI",
227
+ # layout="centered"
228
+ # )
229
+
230
+ # st.title("🤖 Notes.AI")
231
+ # st.subheader("Hey! Here you can search for notes of CSE 7th Sem! Read Notes, Read PYQ answers also!!")
232
+
233
+ # # Step 1: Initialize the connection and check if the user is already logged in
234
+ # if "conn" not in st.session_state:
235
+ # st.session_state.conn = setup_db()
236
+
237
+ # if "username" not in st.session_state:
238
+ # username = st.text_input("Enter your name to proceed:")
239
+ # if username:
240
+ # with st.spinner("Loading chatbot interface... Please wait."):
241
+ # st.session_state.username = username
242
+ # st.session_state.chat_history = []
243
+ # st.session_state.vectorstore = setup_vectorstore()
244
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)
245
+ # st.success(f"Welcome, {username}! The chatbot interface is ready.")
246
+ # else:
247
+ # username = st.session_state.username
248
+
249
+ # # Step 2: Initialize components if not already set
250
+ # if "conversational_chain" not in st.session_state:
251
+ # st.session_state.vectorstore = setup_vectorstore()
252
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)
253
+
254
+ # # Step 3: Show chatbot interface
255
+ # if "username" in st.session_state:
256
+ # st.subheader(f"Hello {username}, start your query below!")
257
+
258
+ # user_input = st.chat_input("Ask AI....")
259
+ # if user_input:
260
+ # with st.spinner("Processing your query... Please wait."):
261
+ # # Save user input to chat history
262
+ # st.session_state.chat_history.append({"role": "user", "content": user_input})
263
+
264
+ # # Display user's message
265
+ # with st.chat_message("user"):
266
+ # st.markdown(user_input)
267
+
268
+ # # Get assistant's response
269
+ # with st.chat_message("assistant"):
270
+ # response = st.session_state.conversational_chain({"question": user_input})
271
+ # assistant_response = response["answer"]
272
+ # st.markdown(assistant_response)
273
+
274
+ # # Save response to chat history
275
+ # st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
276
+
277
+ # # Save chat history to SQLite database with timestamp
278
+ # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
279
+ # day = datetime.now().strftime("%A") # Get the day of the week (e.g., Monday)
280
+ # save_chat_history(st.session_state.conn, username, timestamp, day, user_input, assistant_response)
281
+
282
+ # # Display chat history for the current user
283
+ # if "username" in st.session_state:
284
+ # st.subheader(f"Chat History for {username}:")
285
+
286
+ # chat_history = load_chat_history(st.session_state.conn, username)
287
+ # if chat_history:
288
+ # for entry in chat_history:
289
+ # timestamp, day, user_message, assistant_response = entry
290
+ # st.write(f"**{day} - {timestamp}:**")
291
+ # st.write(f"**User:** {user_message}")
292
+ # st.write(f"**Assistant:** {assistant_response}")
293
+ # else:
294
+ # st.write("No chat history available.")
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+ # import os
303
+ # import json
304
+ # from datetime import datetime
305
+ # import streamlit as st
306
+ # from langchain_huggingface import HuggingFaceEmbeddings
307
+ # from langchain_chroma import Chroma
308
+ # from langchain_groq import ChatGroq
309
+ # from langchain.memory import ConversationBufferMemory
310
+ # from langchain.chains import ConversationalRetrievalChain
311
+
312
+
313
+ # # Ensure the JSON file exists
314
+ # chat_history_file = "chat_histories.json"
315
+ # if not os.path.exists(chat_history_file):
316
+ # with open(chat_history_file, "w") as f:
317
+ # json.dump({}, f)
318
+
319
+ # # Functions to handle chat history
320
+ # def load_chat_history():
321
+ # with open(chat_history_file, "r") as f:
322
+ # return json.load(f)
323
+
324
+ # def save_chat_history(chat_histories):
325
+ # with open(chat_history_file, "w") as f:
326
+ # json.dump(chat_histories, f, indent=4)
327
+
328
+ # # Function to set up vectorstore
329
+ # def setup_vectorstore():
330
+ # embeddings = HuggingFaceEmbeddings()
331
+ # vectorstore = Chroma(persist_directory="vector_db_dir_notes_ai",
332
+ # embedding_function=embeddings)
333
+ # return vectorstore
334
+
335
+ # # Function to set up chatbot chain
336
+ # def chat_chain(vectorstore):
337
+ # llm = ChatGroq(
338
+ # model="llama-3.1-70b-versatile",
339
+ # temperature=0
340
+ # )
341
+ # retriever = vectorstore.as_retriever()
342
+ # memory = ConversationBufferMemory(
343
+ # llm=llm,
344
+ # output_key="answer",
345
+ # memory_key="chat_history",
346
+ # return_messages=True
347
+ # )
348
+ # chain = ConversationalRetrievalChain.from_llm(
349
+ # llm=llm,
350
+ # retriever=retriever,
351
+ # chain_type="stuff",
352
+ # memory=memory,
353
+ # verbose=True,
354
+ # return_source_documents=True
355
+ # )
356
+ # return chain
357
+
358
+ # # Streamlit UI
359
+ # st.set_page_config(
360
+ # page_title="Notes.AI",
361
+ # page_icon="🤖AI",
362
+ # layout="centered"
363
+ # )
364
+
365
+ # st.title("🤖 Notes.AI")
366
+ # st.subheader("Hey! Here you can search for notes of CSE 7th Sem! Read Notes, Read PYQ answers also!!")
367
+
368
+ # # Step 1: Input user's name
369
+ # if "username" not in st.session_state:
370
+ # username = st.text_input("Enter your name to proceed:")
371
+ # if username:
372
+ # with st.spinner("Loading chatbot interface... Please wait."):
373
+ # st.session_state.username = username
374
+ # st.session_state.chat_history = [] # Initialize empty chat history
375
+ # st.session_state.vectorstore = setup_vectorstore()
376
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)
377
+ # st.success(f"Welcome, {username}! The chatbot interface is ready.")
378
+ # else:
379
+ # username = st.session_state.username
380
+
381
+ # # Step 2: Initialize components if not already set
382
+ # if "conversational_chain" not in st.session_state:
383
+ # st.session_state.vectorstore = setup_vectorstore()
384
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)
385
+
386
+ # # Step 3: Show chatbot interface
387
+ # if "username" in st.session_state:
388
+ # st.subheader(f"Hello {username}, start your query below!")
389
+
390
+ # # Display existing chat history dynamically
391
+ # for message in st.session_state.chat_history:
392
+ # if message["role"] == "user":
393
+ # with st.chat_message("user"):
394
+ # st.markdown(f"{message['day']}: {message['content']}")
395
+ # elif message["role"] == "assistant":
396
+ # with st.chat_message("assistant"):
397
+ # st.markdown(f"{message['day']}: {message['content']}")
398
+
399
+ # # User input section
400
+ # user_input = st.chat_input("Ask AI....")
401
+ # if user_input:
402
+ # with st.spinner("Processing your query... Please wait."):
403
+ # # Save user input to session state
404
+ # st.session_state.chat_history.append({"role": "user", "content": user_input})
405
+
406
+ # # Display user's message
407
+ # with st.chat_message("user"):
408
+ # st.markdown(user_input)
409
+
410
+ # # Get assistant's response
411
+ # with st.chat_message("assistant"):
412
+ # response = st.session_state.conversational_chain({"question": user_input})
413
+ # assistant_response = response["answer"]
414
+ # st.markdown(assistant_response)
415
+
416
+ # # Save assistant's response to session state
417
+ # st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
418
+
419
+ # # Save chat history to file with timestamp
420
+ # chat_histories = load_chat_history()
421
+ # timestamp = datetime.now()
422
+ # day = timestamp.strftime("%A") # Get the full weekday name (e.g., Monday)
423
+ # formatted_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
424
+ # if username not in chat_histories:
425
+ # chat_histories[username] = []
426
+ # chat_histories[username].append({
427
+ # "timestamp": formatted_timestamp,
428
+ # "day": day,
429
+ # "user": user_input,
430
+ # "assistant": assistant_response
431
+ # })
432
+ # save_chat_history(chat_histories)
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
+
443
+ # import os
444
+ # import json
445
+
446
+ # import streamlit as st
447
+ # from langchain_huggingface import HuggingFaceEmbeddings
448
+ # from langchain_chroma import Chroma
449
+ # from langchain_groq import ChatGroq
450
+ # from langchain.memory import ConversationBufferMemory
451
+ # from langchain.chains import ConversationalRetrievalChain
452
+
453
+ # from vectorize_documents import embeddings
454
+
455
+
456
+ # working_dir = os.path.dirname(os.path.abspath(__file__))
457
+ # config_data = json.load(open(f"{working_dir}/config.json"))
458
+ # GROQ_API_KEY = config_data["GROQ_API_KEY"]
459
+ # os.environ["GROQ_API_KEY"]= GROQ_API_KEY
460
+
461
+
462
+ # def setup_vectorstore():
463
+ # persist_directory = f"{working_dir}/vector_db_dir_notes_ai"
464
+ # embeddings = HuggingFaceEmbeddings()
465
+ # vectorstore = Chroma(persist_directory=persist_directory,
466
+ # embedding_function=embeddings)
467
+ # return vectorstore
468
+
469
+ # def chat_chain(vectorstore):
470
+ # llm = ChatGroq(
471
+ # model = "llama-3.1-70b-versatile",
472
+ # temperature = 0
473
+ # )
474
+ # retriever = vectorstore.as_retriever()
475
+ # memory = ConversationBufferMemory(
476
+ # llm = llm,
477
+ # output_key = "answer",
478
+ # memory_key = "chat_history",
479
+ # return_messages = True
480
+ # )
481
+ # chain = ConversationalRetrievalChain.from_llm(
482
+ # llm=llm,
483
+ # retriever = retriever,
484
+ # chain_type = "stuff",
485
+ # memory = memory,
486
+ # verbose=True,
487
+ # return_source_documents= True
488
+ # )
489
+ # return chain
490
+
491
+ # st.set_page_config(
492
+ # page_title="Notes.AI",
493
+ # page_icon="🤖AI",
494
+ # layout="centered"
495
+ # )
496
+
497
+ # st.title("🤖 Notes.AI")
498
+
499
+ # # st.title("🤖 Hey! Here you can search for notes of CSE 7th Sem! Read Notes, Read PYQ answers also!!")
500
+
501
+ # st.subheader("Hey! Here you can search for notes of CSE 7th Sem! Read Notes, Read PYQ answers also!!")
502
+
503
+ # # Additional subheading
504
+ # st.subheader("Start your query below to get instant help!")
505
+
506
+ # if "chat_history" not in st.session_state:
507
+ # st.session_state.chat_history = []
508
+
509
+ # if "vectorstore" not in st.session_state:
510
+ # st.session_state.vectorstore = setup_vectorstore()
511
+
512
+ # if "conversational_chain" not in st.session_state:
513
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore)
514
+
515
+ # for message in st.session_state.chat_history:
516
+ # with st.chat_message(message["role"]):
517
+ # st.markdown(message["content"])
518
+ # user_input = st.chat_input("Ask AI....")
519
+
520
+ # if user_input:
521
+ # st.session_state.chat_history.append({"role":"user", "content":user_input})
522
+
523
+ # with st.chat_message("user"):
524
+ # st.markdown(user_input)
525
+
526
+ # with st.chat_message("assistant"):
527
+ # response = st.session_state.conversational_chain({"question":user_input})
528
+ # assistant_response = response["answer"]
529
+ # st.markdown(assistant_response)
530
+ # st.session_state.chat_history.append({"role":"assistant","content": assistant_response})
chat_history.db ADDED
Binary file (41 kB). View file
 
config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GROQ_API_KEY": "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"}
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.38.0
2
+ langchain-community==0.2.16
3
+ langchain-text-splitters==0.2.4
4
+ langchain-chroma==0.1.3
5
+ langchain-huggingface==0.0.3
6
+ langchain-groq==0.1.9
7
+ unstructured==0.15.0
8
+ unstructured[pdf]==0.15.0
9
+ nltk==3.8.1
10
+ psycopg2-binary
11
+ pgvector
12
+ langchain_postgres
vectorize_documents.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import UnstructuredFileLoader
2
+ from langchain_community.document_loaders import DirectoryLoader
3
+ from langchain_text_splitters import CharacterTextSplitter
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_chroma import Chroma
6
+
7
+
8
+ # # Define a function to perform vectorization
9
def vectorize_documents(
    data_dir="Data_2R",
    persist_directory="vector_db_2R",
    chunk_size=2000,
    chunk_overlap=500,
):
    """Load the PDFs in ``data_dir``, chunk them, and store them in Chroma.

    Args:
        data_dir: Directory scanned (non-recursively, ``./*.pdf``) for PDFs.
        persist_directory: Directory where the Chroma collection is written.
        chunk_size: Maximum chunk length passed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        None. Progress is reported on stdout.
    """
    loader = DirectoryLoader(
        path=data_dir,
        glob="./*.pdf",
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Split the loaded documents into overlapping chunks for embedding.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Nothing to index: stop before loading the embedding model or touching
    # the vector store. NOTE(review): an empty batch is believed to be
    # rejected by Chroma; this guard turns that into a clear message.
    if not text_chunks:
        print(f"No PDF content found in {data_dir!r}; nothing was vectorized.")
        return

    # Embed the chunks and persist them; the returned store is not needed here.
    Chroma.from_documents(
        documents=text_chunks,
        embedding=HuggingFaceEmbeddings(),
        persist_directory=persist_directory,
    )

    print("Documents Vectorized and saved in VectorDB")
36
+
37
+
38
+
39
# Module-level embedding model. It is instantiated as soon as this module is
# imported (it sits outside the main guard below), so importing
# `embeddings` from here triggers the model construction.
embeddings = HuggingFaceEmbeddings()


# Run the vectorization only when this file is executed as a script; a plain
# import does not re-index the documents.
if __name__ == "__main__":
    vectorize_documents()
46
+
47
+
48
+
49
+ # # Define a function to perform vectorization
50
+ # def vectorize_documents():
51
+ # # Loading the embedding model
52
+ # embeddings = HuggingFaceEmbeddings()
53
+
54
+ # loader = DirectoryLoader(
55
+ # path="Data",
56
+ # glob="./*.pdf",
57
+ # loader_cls=UnstructuredFileLoader
58
+ # )
59
+
60
+ # documents = loader.load()
61
+
62
+ # # Splitting the text and creating chunks of these documents.
63
+ # text_splitter = CharacterTextSplitter(
64
+ # chunk_size=2000,
65
+ # chunk_overlap=500
66
+ # )
67
+
68
+ # text_chunks = text_splitter.split_documents(documents)
69
+
70
+ # # Store in Chroma vector DB
71
+ # vectordb = Chroma.from_documents(
72
+ # documents=text_chunks,
73
+ # embedding=embeddings,
74
+ # persist_directory="vector_db_dir"
75
+ # )
76
+
77
+ # print("Documents Vectorized and saved in VectorDB")
78
+
79
+
80
+ # # Expose embeddings if needed
81
+ # embeddings = HuggingFaceEmbeddings()
82
+
83
+
84
+ # # Main guard to prevent execution on import
85
+ # if __name__ == "__main__":
86
+ # vectorize_documents()