Seth0330 committed on
Commit
b72bfb1
·
verified ·
1 Parent(s): 9a66ef3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -38
app.py CHANGED
@@ -36,11 +36,13 @@ uploaded_files = st.file_uploader(
36
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
37
  )
38
 
 
39
  def flatten_json_obj(obj, parent_key="", sep="."):
40
  items = {}
41
  if isinstance(obj, dict):
42
  for k, v in obj.items():
43
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
 
44
  if (
45
  k.lower() in {"customer", "user", "email", "username"} and
46
  isinstance(v, str) and "@" in v
@@ -49,8 +51,8 @@ def flatten_json_obj(obj, parent_key="", sep="."):
49
  local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
50
  parts = [part for part in local_clean.split() if part]
51
  if parts:
52
- items[new_key + "_name"] = parts[0]
53
- items[new_key + "_all_names"] = " ".join(parts)
54
  items.update(flatten_json_obj(v, new_key, sep=sep))
55
  elif isinstance(obj, list):
56
  for i, v in enumerate(obj):
@@ -121,40 +123,6 @@ def ingest_json_files(files):
121
  if uploaded_files and st.button("Ingest batch to database"):
122
  ingest_json_files(uploaded_files)
123
 
124
- # --- Improved entity search/filter
125
- def extract_main_entity(question):
126
- # crude: get the first capitalized word, or all words
127
- tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
128
- keywords = [t.lower() for t in tokens if t.lower() not in {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}]
129
- # e.g. ["johnny", "spend"] → "johnny"
130
- return keywords[0] if keywords else None
131
-
132
- def filter_records_by_entity(records, entity):
133
- matches = []
134
- for doc in records:
135
- if entity and entity in doc.page_content.lower():
136
- matches.append(doc)
137
- return matches if matches else records
138
-
139
- def hybrid_query(user_query, top_k=5):
140
- vector_docs = query_vector_db(user_query, top_k=top_k)
141
- fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
142
- all_docs = []
143
- seen_ids = set()
144
- for doc in (vector_docs + fuzzy_docs):
145
- doc_id = doc.metadata.get("id")
146
- if doc_id not in seen_ids:
147
- all_docs.append(doc)
148
- seen_ids.add(doc_id)
149
- # Filter for entity match if possible
150
- entity = extract_main_entity(user_query)
151
- entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
152
- # Optionally, highlight the entity in the flat_text for the LLM
153
- for doc in entity_docs:
154
- if entity:
155
- doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
156
- return entity_docs[:top_k]
157
-
158
  def query_vector_db(user_query, top_k=5):
159
  query_emb = get_embedding(user_query)
160
  conn = sqlite3.connect(DB_PATH)
@@ -205,6 +173,42 @@ def python_fuzzy_match(user_query, top_k=5):
205
  docs.append(Document(page_content=row[4], metadata=meta))
206
  return docs
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  class HybridRetriever(BaseRetriever):
209
  top_k: int = Field(default=5)
210
  def _get_relevant_documents(self, query, run_manager=None, **kwargs):
@@ -213,8 +217,10 @@ class HybridRetriever(BaseRetriever):
213
  # --- Prompt (explicitly tells LLM what to do)
214
  system_prompt = (
215
  "You are a JSON data assistant. "
216
- "If a question mentions a name (like Johnny), find any record where that name appears as part of any field value (including emails or usernames). "
217
- "Use the provided records to answer directly. If you can't find the answer, reply: 'I don’t have that information.' "
 
 
218
  "Never make up data. Never ask for clarification."
219
  )
220
  prompt = ChatPromptTemplate.from_messages([
 
36
  "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
37
  )
38
 
39
+ # --- Enhanced Flattening: extract names from emails/user fields for LLM context
40
  def flatten_json_obj(obj, parent_key="", sep="."):
41
  items = {}
42
  if isinstance(obj, dict):
43
  for k, v in obj.items():
44
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
45
+ # If this is a customer/email field, extract name!
46
  if (
47
  k.lower() in {"customer", "user", "email", "username"} and
48
  isinstance(v, str) and "@" in v
 
51
  local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
52
  parts = [part for part in local_clean.split() if part]
53
  if parts:
54
+ items[new_key + "_name"] = parts[0].lower()
55
+ items[new_key + "_all_names"] = " ".join(parts).lower()
56
  items.update(flatten_json_obj(v, new_key, sep=sep))
57
  elif isinstance(obj, list):
58
  for i, v in enumerate(obj):
 
123
  if uploaded_files and st.button("Ingest batch to database"):
124
  ingest_json_files(uploaded_files)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def query_vector_db(user_query, top_k=5):
127
  query_emb = get_embedding(user_query)
128
  conn = sqlite3.connect(DB_PATH)
 
173
  docs.append(Document(page_content=row[4], metadata=meta))
174
  return docs
175
 
176
# Filler words that never identify the entity a question is about.  The
# interrogatives and auxiliaries matter: without them "What did Johnny spend?"
# would yield "what" rather than "johnny".
_ENTITY_STOPWORDS = frozenset({
    "how", "much", "did", "spend", "was", "the", "is", "in", "on", "for",
    "a", "an", "of", "to", "with",
    "what", "who", "whom", "whose", "which", "when", "where", "why",
    "do", "does", "are", "were", "has", "have", "had", "many", "by",
    "from", "at",
})


def extract_main_entity(question):
    """Return the first meaningful word of *question*, lowercased.

    This is a crude heuristic: the question is split into alphanumeric
    tokens and the first one that is not a known filler word is taken to be
    the entity (e.g. a customer name) the question refers to.

    Args:
        question: The user's natural-language question. ``None`` or an empty
            string is accepted.

    Returns:
        The lowercased token, or ``None`` when the question is empty or
        contains only filler words.
    """
    if not question:
        return None
    for token in re.findall(r"\b([A-Za-z0-9]+)\b", question):
        word = token.lower()
        if word not in _ENTITY_STOPWORDS:
            return word
    return None
180
+
181
def filter_records_by_entity(records, entity):
    """Keep the records whose text mentions *entity*, ignoring case.

    Args:
        records: Sequence of objects exposing a ``page_content`` string.
        entity: Substring to look for; may be ``None`` or empty.

    Returns:
        A new list of the matching records. When *entity* is falsy or no
        record matches, the original *records* object is returned unchanged
        so the caller always has something to work with.
    """
    if not entity:
        return records
    # Content is lowercased below, so the needle must be lowercased too;
    # otherwise an entity such as "Johnny" could never match.
    needle = entity.lower()
    matches = [doc for doc in records if needle in doc.page_content.lower()]
    return matches if matches else records
187
+
188
def hybrid_query(user_query, top_k=5):
    """Retrieve the single best record for *user_query*.

    Results from the vector search and the fuzzy matcher are merged and
    de-duplicated by their ``"id"`` metadata, then narrowed to records that
    mention the entity extracted from the query. Only the first remaining
    record is returned, to keep the LLM context small.

    Side effects: the chosen record's ``page_content`` is rewritten in place
    with the entity wrapped in ``**`` markers, and the final context is
    rendered in the Streamlit page.

    Args:
        user_query: The user's natural-language question.
        top_k: Number of candidates requested from each retriever. The
            return value still holds at most one document.

    Returns:
        A list with the selected document, or an empty list when neither
        retriever produced a candidate.
    """
    vector_docs = query_vector_db(user_query, top_k=top_k)
    fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)

    unique_docs = []
    seen_ids = set()
    for doc in vector_docs + fuzzy_docs:
        doc_id = doc.metadata.get("id")
        if doc_id is None:
            # Without an id there is no evidence of duplication; dropping
            # these would collapse every id-less record into the first one.
            unique_docs.append(doc)
        elif doc_id not in seen_ids:
            seen_ids.add(doc_id)
            unique_docs.append(doc)

    if not unique_docs:
        return []

    entity = extract_main_entity(user_query)
    # filter_records_by_entity falls back to its input when nothing matches,
    # so entity_docs is never empty here.
    entity_docs = filter_records_by_entity(unique_docs, entity) if entity else unique_docs

    # Show only the most relevant record (to make the LLM's job easier).
    doc = entity_docs[0]
    if entity:
        # Highlight the entity so it stands out in the context.
        doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
    st.markdown("#### Context shown to LLM")
    st.code(doc.page_content)
    return [doc]
211
+
212
  class HybridRetriever(BaseRetriever):
213
  top_k: int = Field(default=5)
214
  def _get_relevant_documents(self, query, run_manager=None, **kwargs):
 
217
  # --- Prompt (explicitly tells LLM what to do)
218
  system_prompt = (
219
  "You are a JSON data assistant. "
220
+ "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
221
+ "and answer directly using the record's fields. "
222
+ "For example, if 'customer: johnny.appleseed@gmail.com' and the question is about Johnny, you should use that record."
223
+ "If you can't find the answer, reply: 'I don’t have that information.'"
224
  "Never make up data. Never ask for clarification."
225
  )
226
  prompt = ChatPromptTemplate.from_messages([