Krish-Upgrix commited on
Commit
c14f8f8
·
verified ·
1 Parent(s): 2219e67

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +321 -0
  2. chat_history.db +0 -0
  3. config.json +1 -0
  4. requirements.txt +13 -0
  5. vectorize_documents.py +71 -0
app.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # version 2: added custom prompts.
2
+
3
+ import os
4
+ import json
5
+ import sqlite3
6
+ from datetime import datetime
7
+ import streamlit as st
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain_chroma import Chroma
10
+ from langchain_groq import ChatGroq
11
+ from langchain.memory import ConversationBufferMemory
12
+ from langchain.chains import ConversationalRetrievalChain
13
+ from langchain.prompts import PromptTemplate
14
+
15
+ from vectorize_documents import embeddings # If needed elsewhere
16
+
# Load config
# Reads config.json next to this file and exports the Groq key via the
# environment so langchain_groq picks it up implicitly.
# NOTE(review): keep config.json out of version control — it holds a secret.
working_dir = os.path.dirname(os.path.abspath(__file__))
# Use a context manager so the file handle is closed deterministically
# (the original json.load(open(...)) leaked the handle).
with open(os.path.join(working_dir, "config.json")) as config_file:
    config_data = json.load(config_file)
GROQ_API_KEY = config_data["GROQ_API_KEY"]
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
22
+
23
# Set up the database
def setup_db():
    """Open (or create) chat_history.db and ensure the chat_histories table exists.

    Returns:
        sqlite3.Connection: a connection created with check_same_thread=False
        so it can be reused across Streamlit's script reruns / threads.
    """
    conn = sqlite3.connect("chat_history.db", check_same_thread=False)
    cursor = conn.cursor()
    # Idempotent schema creation; safe to call on every app start.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS chat_histories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT,
            timestamp TEXT,
            day TEXT,
            user_message TEXT,
            assistant_response TEXT
        )
    """)
    conn.commit()
    return conn
39
+
40
# Set up vectorstore
def setup_vectorstore():
    """Open the persisted Chroma store in Vector_db using default HF embeddings."""
    return Chroma(
        persist_directory="Vector_db",
        embedding_function=HuggingFaceEmbeddings(),
    )
45
+
46
# Custom prompt template
# Passed to the combine-docs chain via combine_docs_chain_kwargs in chat_chain().
# The chain fills {context} with retrieved documents, {chat_history} from the
# conversation memory (memory_key="chat_history"), and {question} with the
# user's query. The template text itself is runtime behavior — do not edit
# casually.
custom_prompt_template = PromptTemplate.from_template("""
You are a helpful assistant that helps users choose laptops.

1. Analyze the user's query, take information from vectordb and then give top 3 laptops to user from Relevent information that is context.
2. Keep suggestions clear and concise with names, specs, and reasons only from relevant information context.

Relevant Information:
{context}

Chat History:
{chat_history}

User Query:
{question}

Assistant Response:
""")
64
+
65
+
66
# Set up the chatbot chain with a specific model
def chat_chain(vectorstore, model_name):
    """Build a ConversationalRetrievalChain over *vectorstore* backed by a Groq LLM.

    Args:
        vectorstore: a Chroma store; turned into a retriever for context lookup.
        model_name: Groq model identifier (one of the UI's model_options).

    Returns:
        A chain that, when called with {"question": ...}, returns a dict
        containing "answer" and the retrieved source documents.
    """
    llm = ChatGroq(model=model_name, temperature=0.3)  # low temp: mostly deterministic suggestions
    retriever = vectorstore.as_retriever()
    memory = ConversationBufferMemory(
        llm=llm,
        output_key="answer",        # which chain output gets stored in memory
        memory_key="chat_history",  # must match {chat_history} in custom_prompt_template
        return_messages=True
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        # Inject the custom prompt into the combine-documents step.
        combine_docs_chain_kwargs={"prompt": custom_prompt_template},
        return_source_documents=True,
        verbose=True
    )
    return chain
85
+
86
# Streamlit UI setup
st.set_page_config(page_title="ByteX-Ai", page_icon="🤖AI", layout="centered")
st.title("🤖 ByteX-Ai")
st.subheader("Hey! Get your Laptop!!")

# Initialize DB connection
# NOTE(review): the chat_histories table is created here but nothing below ever
# inserts into it — chat persistence appears unfinished; confirm intent.
if "conn" not in st.session_state:
    st.session_state.conn = setup_db()

# Prompt user to log in
if "username" not in st.session_state:
    username = st.text_input("Enter your name to proceed:")
    if username:
        with st.spinner("Loading chatbot interface... Please wait."):

            st.session_state.username = username
            st.session_state.chat_history = []
            st.session_state.vectorstore = setup_vectorstore()
            st.success(f"Welcome, {username}! Now select a model to start chatting.")
else:
    username = st.session_state.username

# Model selection options
model_options = [
    "gemma2-9b-it",
    "llama-3.1-8b-instant",
    "llama3-70b-8192",
    "llama3-8b-8192"
]

selected_model = st.selectbox("Choose a model:", model_options)

# Ensure vectorstore exists
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = setup_vectorstore()

# Set or update the selected model
if "selected_model" not in st.session_state:
    st.session_state.selected_model = selected_model

# Reset conversational_chain if model changes or not yet initialized.
# Switching models also clears the visible history, since the new chain's
# memory starts empty.
if ("conversational_chain" not in st.session_state) or (st.session_state.selected_model != selected_model):
    st.session_state.selected_model = selected_model
    st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore, selected_model)
    st.session_state.chat_history = []

# Reset chat manually (rebuilds the chain so its internal memory is cleared too)
if st.button("🔄 Reset Chat"):
    st.session_state.chat_history = []
    st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore, st.session_state.selected_model)
    st.success("Chat reset!")

# Show chat UI
if "username" in st.session_state:
    st.subheader(f"Hello {username}, start your query below!")

    # Replay prior turns so the transcript survives Streamlit reruns.
    if st.session_state.chat_history:
        for message in st.session_state.chat_history:
            if message['role'] == 'user':
                with st.chat_message("user"):
                    st.markdown(message["content"])
            elif message['role'] == 'assistant':
                with st.chat_message("assistant"):
                    st.markdown(message["content"])

    user_input = st.chat_input("Ask AI....")

    if user_input:
        with st.spinner("Processing your query... Please wait."):
            st.session_state.chat_history.append({"role": "user", "content": user_input})

            with st.chat_message("user"):
                st.markdown(user_input)

            with st.chat_message("assistant"):
                # Calling the chain directly; "answer" is guaranteed by the
                # output_key set in chat_chain().
                response = st.session_state.conversational_chain({"question": user_input})
                assistant_response = response["answer"]
                st.markdown(assistant_response)

            st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+ # Version 1: working properly but there is no prompt refinement.
178
+
179
+ # import os
180
+ # import json
181
+ # import sqlite3
182
+ # from datetime import datetime
183
+ # import streamlit as st
184
+ # from langchain_huggingface import HuggingFaceEmbeddings
185
+ # from langchain_chroma import Chroma
186
+ # from langchain_groq import ChatGroq
187
+ # from langchain.memory import ConversationBufferMemory
188
+ # from langchain.chains import ConversationalRetrievalChain
189
+
190
+ # from vectorize_documents import embeddings # If needed elsewhere
191
+
192
+ # # Load config
193
+ # working_dir = os.path.dirname(os.path.abspath(__file__))
194
+ # config_data = json.load(open(f"{working_dir}/config.json"))
195
+ # GROQ_API_KEY = config_data["GROQ_API_KEY"]
196
+ # os.environ["GROQ_API_KEY"] = GROQ_API_KEY
197
+
198
+ # # Set up the database
199
+ # def setup_db():
200
+ # conn = sqlite3.connect("chat_history.db", check_same_thread=False)
201
+ # cursor = conn.cursor()
202
+ # cursor.execute("""
203
+ # CREATE TABLE IF NOT EXISTS chat_histories (
204
+ # id INTEGER PRIMARY KEY AUTOINCREMENT,
205
+ # username TEXT,
206
+ # timestamp TEXT,
207
+ # day TEXT,
208
+ # user_message TEXT,
209
+ # assistant_response TEXT
210
+ # )
211
+ # """)
212
+ # conn.commit()
213
+ # return conn
214
+
215
+ # # Set up vectorstore
216
+ # def setup_vectorstore():
217
+ # embeddings = HuggingFaceEmbeddings()
218
+ # vectorstore = Chroma(persist_directory="Vector_db", embedding_function=embeddings)
219
+ # return vectorstore
220
+
221
+ # # Set up the chatbot chain with a specific model
222
+ # def chat_chain(vectorstore, model_name):
223
+ # llm = ChatGroq(model=model_name, temperature=0)
224
+ # retriever = vectorstore.as_retriever()
225
+ # memory = ConversationBufferMemory(
226
+ # llm=llm,
227
+ # output_key="answer",
228
+ # memory_key="chat_history",
229
+ # return_messages=True
230
+ # )
231
+ # chain = ConversationalRetrievalChain.from_llm(
232
+ # llm=llm,
233
+ # retriever=retriever,
234
+ # chain_type="stuff",
235
+ # memory=memory,
236
+ # verbose=True,
237
+ # return_source_documents=True
238
+ # )
239
+ # return chain
240
+
241
+ # # Streamlit UI setup
242
+ # st.set_page_config(page_title="ByteX-Ai", page_icon="🤖AI", layout="centered")
243
+ # st.title("🤖 ByteX-Ai")
244
+ # st.subheader("Hey! Get your Laptop!!")
245
+
246
+ # # Initialize DB connection
247
+ # if "conn" not in st.session_state:
248
+ # st.session_state.conn = setup_db()
249
+
250
+ # # Prompt user to log in
251
+ # if "username" not in st.session_state:
252
+ # username = st.text_input("Enter your name to proceed:")
253
+ # if username:
254
+ # with st.spinner("Loading chatbot interface... Please wait."):
255
+ # st.session_state.username = username
256
+ # st.session_state.chat_history = []
257
+ # st.session_state.vectorstore = setup_vectorstore()
258
+ # st.success(f"Welcome, {username}! Now select a model to start chatting.")
259
+ # else:
260
+ # username = st.session_state.username
261
+
262
+ # # Model selection options
263
+ # model_options = [
264
+ # "gemma2-9b-it",
265
+ # "llama-3.1-8b-instant",
266
+ # "llama3-70b-8192",
267
+ # "llama3-8b-8192"
268
+ # ]
269
+
270
+ # # Model dropdown
271
+ # selected_model = st.selectbox("Choose a model:", model_options)
272
+
273
+ # # Ensure vectorstore exists
274
+ # if "vectorstore" not in st.session_state:
275
+ # st.session_state.vectorstore = setup_vectorstore()
276
+
277
+ # # Set or update the selected model
278
+ # if "selected_model" not in st.session_state:
279
+ # st.session_state.selected_model = selected_model
280
+
281
+ # # Reset conversational_chain if model changes or not yet initialized
282
+ # if ("conversational_chain" not in st.session_state) or (st.session_state.selected_model != selected_model):
283
+ # st.session_state.selected_model = selected_model
284
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore, selected_model)
285
+ # st.session_state.chat_history = []
286
+
287
+ # # Reset chat manually
288
+ # if st.button("🔄 Reset Chat"):
289
+ # st.session_state.chat_history = []
290
+ # st.session_state.conversational_chain = chat_chain(st.session_state.vectorstore, st.session_state.selected_model)
291
+ # st.success("Chat reset!")
292
+
293
+
294
+ # # Show chat UI
295
+ # if "username" in st.session_state:
296
+ # st.subheader(f"Hello {username}, start your query below!")
297
+
298
+ # if st.session_state.chat_history:
299
+ # for message in st.session_state.chat_history:
300
+ # if message['role'] == 'user':
301
+ # with st.chat_message("user"):
302
+ # st.markdown(message["content"])
303
+ # elif message['role'] == 'assistant':
304
+ # with st.chat_message("assistant"):
305
+ # st.markdown(message["content"])
306
+
307
+ # user_input = st.chat_input("Ask AI....")
308
+
309
+ # if user_input:
310
+ # with st.spinner("Processing your query... Please wait."):
311
+ # st.session_state.chat_history.append({"role": "user", "content": user_input})
312
+
313
+ # with st.chat_message("user"):
314
+ # st.markdown(user_input)
315
+
316
+ # with st.chat_message("assistant"):
317
+ # response = st.session_state.conversational_chain({"question": user_input})
318
+ # assistant_response = response["answer"]
319
+ # st.markdown(assistant_response)
320
+
321
+ # st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
chat_history.db ADDED
Binary file (12.3 kB). View file
 
config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GROQ_API_KEY": "REPLACE_WITH_YOUR_GROQ_API_KEY"}
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
+ uvicorn
+ google-generativeai
+ PyPDF2
+ fpdf
+ pandas
+ streamlit
+ langchain==0.2.16
+ langchain-community==0.2.16
+ langchain-text-splitters==0.2.4
+ langchain-chroma==0.1.3
+ langchain-huggingface==0.0.3
+ langchain-groq==0.1.9
+ unstructured==0.15.0
+ unstructured[pdf]==0.15.0
vectorize_documents.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain_text_splitters import CharacterTextSplitter
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_chroma import Chroma
5
+ from langchain.docstore.document import Document
6
+ import pandas as pd
7
+ import os
8
+ import glob
9
+ from PyPDF2 import PdfReader # Ensure PyPDF2 is installed
10
+
11
# Define a function to process CSV files
def process_csv_files(csv_files):
    """Turn each CSV row into a Document whose text is the space-joined
    string form of the row's cells."""
    docs = []
    for path in csv_files:
        frame = pd.read_csv(path)
        docs.extend(
            Document(page_content=" ".join(row.astype(str)))
            for _, row in frame.iterrows()
        )
    return docs
20
+
21
# Define a function to process PDF files
def process_pdf_files(pdf_files):
    """Extract per-page text from each PDF, wrapping non-empty pages in Documents."""
    docs = []
    for path in pdf_files:
        for page in PdfReader(path).pages:
            page_text = page.extract_text()
            # Pages with no extractable text are skipped entirely.
            if page_text:
                docs.append(Document(page_content=page_text))
    return docs
31
+
32
# Define a function to perform vectorization for CSV and PDF files
def vectorize_documents():
    """Read all CSV/PDF files under Data/, chunk them, and persist embeddings to Vector_db."""
    embeddings = HuggingFaceEmbeddings()

    # Directory containing files
    data_directory = "Data"  # Replace with your folder name
    csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
    pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))

    # Process CSV and PDF files
    documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)

    # Splitting the text and creating chunks of these documents
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )

    text_chunks = text_splitter.split_documents(documents)

    # Process text chunks in batches
    batch_size = 5000  # Chroma's batch size limit is 5461, set a slightly smaller size for safety
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]

        # Store the batch in Chroma vector DB
        # NOTE(review): from_documents is invoked once per batch against the same
        # persist_directory — batches appear intended to accumulate in the store;
        # confirm that re-running this script does not duplicate prior documents.
        vectordb = Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory="Vector_db"
        )

    print("Documents Vectorized and saved in VectorDB")
65
+
66
# Expose embeddings if needed
# NOTE(review): this runs at *import* time — app.py does
# `from vectorize_documents import embeddings`, so constructing
# HuggingFaceEmbeddings here is a heavy import-time side effect
# (model load/download); consider making it lazy. Kept as-is because the
# module-level name is part of this module's public interface.
embeddings = HuggingFaceEmbeddings()

# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()