Seth0330 commited on
Commit
86eb190
·
verified ·
1 Parent(s): 3a352a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -11
app.py CHANGED
@@ -6,7 +6,6 @@ import sqlite3
6
  import json
7
  import numpy as np
8
  import datetime
9
-
10
  from langchain.chains import RetrievalQA
11
  from langchain.schema import Document
12
  from langchain_core.retrievers import BaseRetriever
@@ -32,17 +31,25 @@ if "modal_title" not in st.session_state:
32
  st.session_state.modal_title = ""
33
 
34
  st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
35
- st.title("Chat with Your Vectorized JSON Files (LangChain, SQLite, LLM)")
36
 
37
  uploaded_files = st.file_uploader(
38
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
39
  )
40
 
 
41
  def flatten_json_obj(obj, parent_key="", sep="."):
42
  items = {}
43
  if isinstance(obj, dict):
44
  for k, v in obj.items():
45
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
 
 
 
 
 
 
 
46
  items.update(flatten_json_obj(v, new_key, sep=sep))
47
  elif isinstance(obj, list):
48
  for i, v in enumerate(obj):
@@ -113,6 +120,7 @@ def ingest_json_files(files):
113
  if uploaded_files and st.button("Ingest batch to database"):
114
  ingest_json_files(uploaded_files)
115
 
 
116
  def query_vector_db(user_query, top_k=5):
117
  query_emb = get_embedding(user_query)
118
  conn = sqlite3.connect(DB_PATH)
@@ -132,24 +140,63 @@ def query_vector_db(user_query, top_k=5):
132
  "id": row[0],
133
  "batch_time": str(row[1]),
134
  "source_file": row[2],
135
- "similarity": f"{sim:.4f}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  "raw_json": row[3],
137
  }
138
  docs.append(Document(page_content=row[4], metadata=meta))
139
  return docs
140
 
141
- class SQLiteVectorRetriever(BaseRetriever):
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  top_k: int = Field(default=5)
143
  def _get_relevant_documents(self, query, run_manager=None, **kwargs):
144
- return query_vector_db(query, self.top_k)
145
 
146
- # --- SYSTEM PROMPT & CORRECT PROMPT FORMAT ---
147
  system_prompt = (
148
  "You are a JSON data assistant. Always give a direct, concise answer based only on the context provided. "
149
  "If you do not see the answer in the context, reply: 'I don’t have that information.' "
150
  "Never make up information. Never ask for clarification."
151
  )
152
-
153
  prompt = ChatPromptTemplate.from_messages([
154
  ("system", system_prompt),
155
  ("human", "Context:\n{context}\n\nQuestion: {question}")
@@ -157,7 +204,7 @@ prompt = ChatPromptTemplate.from_messages([
157
 
158
  llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
159
 
160
- retriever = SQLiteVectorRetriever(top_k=5)
161
  qa_chain = RetrievalQA.from_chain_type(
162
  llm=llm,
163
  retriever=retriever,
@@ -165,7 +212,7 @@ qa_chain = RetrievalQA.from_chain_type(
165
  return_source_documents=True,
166
  )
167
 
168
- # --- Conversation Area (fine-tuned style) ---
169
  st.markdown("### Ask any question about your data, just like ChatGPT.")
170
  for msg in st.session_state.messages:
171
  if msg["role"] == "user":
@@ -204,7 +251,7 @@ def send_message():
204
  return
205
  st.session_state.messages.append({"role": "user", "content": user_input})
206
  with st.spinner("Thinking..."):
207
- # Correct input key: "query" (not "question")
208
  result = qa_chain({"query": user_input})
209
  answer = result['result']
210
  st.session_state.messages.append({"role": "assistant", "content": answer})
@@ -214,12 +261,12 @@ def send_message():
214
  doc_list.append({
215
  "file": doc.metadata["source_file"],
216
  "id": doc.metadata["id"],
 
217
  "record": json.loads(doc.metadata["raw_json"])
218
  })
219
  st.session_state.messages.append({"role": "function", "content": json.dumps(doc_list, indent=2)})
220
  st.session_state.temp_input = ""
221
 
222
-
223
  st.text_input("Your message:", key="temp_input", on_change=send_message)
224
 
225
  if st.button("Clear chat"):
 
6
  import json
7
  import numpy as np
8
  import datetime
 
9
  from langchain.chains import RetrievalQA
10
  from langchain.schema import Document
11
  from langchain_core.retrievers import BaseRetriever
 
31
  st.session_state.modal_title = ""
32
 
33
  st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
34
+ st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM)")
35
 
36
  uploaded_files = st.file_uploader(
37
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
38
  )
39
 
40
+ # --- Improved Flattening: extracts entity from emails/user fields for better matching
41
  def flatten_json_obj(obj, parent_key="", sep="."):
42
  items = {}
43
  if isinstance(obj, dict):
44
  for k, v in obj.items():
45
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
46
+ # Entity extraction: add name from email
47
+ if (
48
+ k.lower() in {"customer", "user", "email", "username"} and
49
+ isinstance(v, str) and "@" in v
50
+ ):
51
+ local = v.split("@")[0]
52
+ items[new_key + "_name"] = local
53
  items.update(flatten_json_obj(v, new_key, sep=sep))
54
  elif isinstance(obj, list):
55
  for i, v in enumerate(obj):
 
120
  if uploaded_files and st.button("Ingest batch to database"):
121
  ingest_json_files(uploaded_files)
122
 
123
+ # --- VECTOR RETRIEVAL
124
  def query_vector_db(user_query, top_k=5):
125
  query_emb = get_embedding(user_query)
126
  conn = sqlite3.connect(DB_PATH)
 
140
  "id": row[0],
141
  "batch_time": str(row[1]),
142
  "source_file": row[2],
143
+ "similarity": f"{sim:.4f} (embedding)",
144
+ "raw_json": row[3],
145
+ }
146
+ docs.append(Document(page_content=row[4], metadata=meta))
147
+ return docs
148
+
149
# --- PYTHON FUZZY/KEYWORD SEARCH
def python_fuzzy_match(user_query, top_k=5):
    """Keyword-overlap search over the ``flat_text`` column in SQLite.

    Splits *user_query* into lowercase terms (treating ``@`` and ``.`` as
    separators so e-mail addresses break into their parts), counts how many
    distinct terms occur as substrings of each record's flattened text, and
    returns the ``top_k`` highest-scoring records as LangChain Documents.

    Args:
        user_query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        list[Document]: ``page_content`` is the record's flat text; metadata
        carries ``id``, ``batch_time``, ``source_file``, a ``similarity``
        label of the form ``"<score> (fuzzy)"``, and the raw JSON payload.
    """
    query_terms = set(user_query.lower().replace("@", " ").replace(".", " ").split())
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT id, batch_time, source_file, raw_json, flat_text FROM json_records"
        )
        results = []
        for row in cursor.fetchall():
            flat_text = row[4].lower()
            # Score = number of distinct query terms present as a substring.
            # BUG FIX: the previous expression iterated `term` twice — an
            # outer sum over an inner any() that ignored the outer loop
            # variable — so every record with at least one hit scored
            # len(query_terms), destroying the ranking.
            score = sum(term in flat_text for term in query_terms)
            if score > 0:
                results.append((score, row))
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    # Sort by score only. Comparing full (score, row) tuples, as before,
    # would fall through to comparing raw DB rows on ties, which can raise
    # TypeError for mixed column types and has no meaningful order anyway.
    results.sort(key=lambda pair: pair[0], reverse=True)
    docs = []
    for score, row in results[:top_k]:
        meta = {
            "id": row[0],
            "batch_time": str(row[1]),
            "source_file": row[2],
            "similarity": f"{score} (fuzzy)",
            "raw_json": row[3],
        }
        docs.append(Document(page_content=row[4], metadata=meta))
    return docs
175
 
176
# --- HYBRID RETRIEVER
def hybrid_query(user_query, top_k=5):
    """Merge vector and fuzzy search results, de-duplicated by record id.

    Vector hits come first in the candidate list, so when both retrievers
    return the same record only the embedding-scored copy is kept.

    Args:
        user_query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        list[Document]: at most ``top_k`` unique documents.
    """
    candidates = query_vector_db(user_query, top_k=top_k)
    candidates += python_fuzzy_match(user_query, top_k=top_k)
    # Dicts preserve insertion order; setdefault keeps the first document
    # seen for each id, matching the original seen-set behavior.
    unique_docs = {}
    for doc in candidates:
        unique_docs.setdefault(doc.metadata.get("id"), doc)
    return list(unique_docs.values())[:top_k]
188
+
189
class HybridRetriever(BaseRetriever):
    """LangChain retriever that delegates to ``hybrid_query`` (vector + fuzzy).

    NOTE(review): BaseRetriever is a pydantic model, so ``top_k`` must be
    declared as an annotated Field rather than assigned in ``__init__``.
    """
    # Maximum number of documents returned per query.
    top_k: int = Field(default=5)
    def _get_relevant_documents(self, query, run_manager=None, **kwargs):
        # Invoked by the QA chain; run_manager is accepted but unused here.
        return hybrid_query(query, self.top_k)
193
 
194
+ # --- SYSTEM PROMPT & PROMPT TEMPLATE
195
  system_prompt = (
196
  "You are a JSON data assistant. Always give a direct, concise answer based only on the context provided. "
197
  "If you do not see the answer in the context, reply: 'I don’t have that information.' "
198
  "Never make up information. Never ask for clarification."
199
  )
 
200
  prompt = ChatPromptTemplate.from_messages([
201
  ("system", system_prompt),
202
  ("human", "Context:\n{context}\n\nQuestion: {question}")
 
204
 
205
  llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
206
 
207
+ retriever = HybridRetriever(top_k=5)
208
  qa_chain = RetrievalQA.from_chain_type(
209
  llm=llm,
210
  retriever=retriever,
 
212
  return_source_documents=True,
213
  )
214
 
215
+ # --- Chat UI and Conversation Area ---
216
  st.markdown("### Ask any question about your data, just like ChatGPT.")
217
  for msg in st.session_state.messages:
218
  if msg["role"] == "user":
 
251
  return
252
  st.session_state.messages.append({"role": "user", "content": user_input})
253
  with st.spinner("Thinking..."):
254
+ # Correct key: "query"
255
  result = qa_chain({"query": user_input})
256
  answer = result['result']
257
  st.session_state.messages.append({"role": "assistant", "content": answer})
 
261
  doc_list.append({
262
  "file": doc.metadata["source_file"],
263
  "id": doc.metadata["id"],
264
+ "similarity": doc.metadata["similarity"],
265
  "record": json.loads(doc.metadata["raw_json"])
266
  })
267
  st.session_state.messages.append({"role": "function", "content": json.dumps(doc_list, indent=2)})
268
  st.session_state.temp_input = ""
269
 
 
270
  st.text_input("Your message:", key="temp_input", on_change=send_message)
271
 
272
  if st.button("Clear chat"):