Seth0330 committed on
Commit
9a5dc6b
·
verified ·
1 Parent(s): f5c02d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -14
app.py CHANGED
@@ -22,21 +22,15 @@ if "ingested_batches" not in st.session_state:
22
  st.session_state.ingested_batches = 0
23
  if "messages" not in st.session_state:
24
  st.session_state.messages = []
25
- if "modal_open" not in st.session_state:
26
- st.session_state.modal_open = False
27
- if "modal_content" not in st.session_state:
28
- st.session_state.modal_content = ""
29
- if "modal_title" not in st.session_state:
30
- st.session_state.modal_title = ""
31
 
32
- st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
33
- st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM)")
34
 
35
  uploaded_files = st.file_uploader(
36
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
37
  )
38
 
39
- # --- Enhanced Flattening: extract names from emails/user fields for LLM context
40
  def flatten_json_obj(obj, parent_key="", sep="."):
41
  items = {}
42
  if isinstance(obj, dict):
@@ -62,6 +56,25 @@ def flatten_json_obj(obj, parent_key="", sep="."):
62
  items[parent_key] = obj
63
  return items
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def get_embedding(text):
66
  client = openai.OpenAI(api_key=OPENAI_API_KEY)
67
  response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
@@ -88,6 +101,7 @@ def ingest_json_files(files):
88
  rows = []
89
  batch_time = datetime.datetime.utcnow().isoformat()
90
  for file in files:
 
91
  raw = json.load(file)
92
  source_name = file.name
93
  if isinstance(raw, list):
@@ -199,7 +213,6 @@ def filter_records_by_entity(records, entity):
199
  matches.append(doc)
200
  return matches if matches else records
201
 
202
-
203
  def hybrid_query(user_query, top_k=5):
204
  vector_docs = query_vector_db(user_query, top_k=top_k)
205
  fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
@@ -211,11 +224,13 @@ def hybrid_query(user_query, top_k=5):
211
  all_docs.append(doc)
212
  seen_ids.add(doc_id)
213
  entity = extract_main_entity(user_query)
 
 
 
 
214
  entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
215
- # Show only the most relevant record (to make LLM's job easier)
216
  if entity_docs:
217
  doc = entity_docs[0]
218
- # Optionally, highlight entity in context
219
  if entity:
220
  doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
221
  st.markdown("#### Context shown to LLM")
@@ -229,7 +244,6 @@ class HybridRetriever(BaseRetriever):
229
  def _get_relevant_documents(self, query, run_manager=None, **kwargs):
230
  return hybrid_query(query, self.top_k)
231
 
232
- # --- Prompt (explicitly tells LLM what to do)
233
  system_prompt = (
234
  "You are a JSON data assistant. "
235
  "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
@@ -244,7 +258,6 @@ prompt = ChatPromptTemplate.from_messages([
244
  ])
245
 
246
  llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
247
-
248
  retriever = HybridRetriever(top_k=5)
249
  qa_chain = RetrievalQA.from_chain_type(
250
  llm=llm,
 
22
  st.session_state.ingested_batches = 0
23
  if "messages" not in st.session_state:
24
  st.session_state.messages = []
 
 
 
 
 
 
25
 
26
+ st.set_page_config(page_title="Chat with Your JSON Vectors (DEBUG)", layout="wide")
27
+ st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM, DEBUG)")
28
 
29
  uploaded_files = st.file_uploader(
30
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
31
  )
32
 
33
+ # --- Enhanced flattening with name extraction for emails/user fields
34
  def flatten_json_obj(obj, parent_key="", sep="."):
35
  items = {}
36
  if isinstance(obj, dict):
 
56
  items[parent_key] = obj
57
  return items
58
 
59
+ # --- DEBUG: Show flattening of uploaded JSONs
60
+ if uploaded_files:
61
+ st.markdown("#### DEBUG: Flat view of all uploaded JSON records")
62
+ for file in uploaded_files:
63
+ file.seek(0)
64
+ try:
65
+ raw = json.load(file)
66
+ if isinstance(raw, list):
67
+ records = raw
68
+ elif isinstance(raw, dict):
69
+ main_lists = [v for v in raw.values() if isinstance(v, list)]
70
+ records = main_lists[0] if main_lists else [raw]
71
+ else:
72
+ records = [raw]
73
+ for idx, rec in enumerate(records):
74
+ st.code(flatten_json_obj(rec))
75
+ except Exception as e:
76
+ st.warning(str(e))
77
+
78
  def get_embedding(text):
79
  client = openai.OpenAI(api_key=OPENAI_API_KEY)
80
  response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
 
101
  rows = []
102
  batch_time = datetime.datetime.utcnow().isoformat()
103
  for file in files:
104
+ file.seek(0)
105
  raw = json.load(file)
106
  source_name = file.name
107
  if isinstance(raw, list):
 
213
  matches.append(doc)
214
  return matches if matches else records
215
 
 
216
  def hybrid_query(user_query, top_k=5):
217
  vector_docs = query_vector_db(user_query, top_k=top_k)
218
  fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
 
224
  all_docs.append(doc)
225
  seen_ids.add(doc_id)
226
  entity = extract_main_entity(user_query)
227
+ st.markdown(f"#### DEBUG: Extracted entity from question: {entity}")
228
+ st.markdown("#### DEBUG: All retrieved docs for your query")
229
+ for idx, doc in enumerate(all_docs):
230
+ st.code(doc.page_content)
231
  entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
 
232
  if entity_docs:
233
  doc = entity_docs[0]
 
234
  if entity:
235
  doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
236
  st.markdown("#### Context shown to LLM")
 
244
  def _get_relevant_documents(self, query, run_manager=None, **kwargs):
245
  return hybrid_query(query, self.top_k)
246
 
 
247
  system_prompt = (
248
  "You are a JSON data assistant. "
249
  "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
 
258
  ])
259
 
260
  llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
 
261
  retriever = HybridRetriever(top_k=5)
262
  qa_chain = RetrievalQA.from_chain_type(
263
  llm=llm,