Update app.py
Browse files
app.py
CHANGED
|
@@ -36,11 +36,13 @@ uploaded_files = st.file_uploader(
|
|
| 36 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 37 |
)
|
| 38 |
|
|
|
|
| 39 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 40 |
items = {}
|
| 41 |
if isinstance(obj, dict):
|
| 42 |
for k, v in obj.items():
|
| 43 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
|
|
|
| 44 |
if (
|
| 45 |
k.lower() in {"customer", "user", "email", "username"} and
|
| 46 |
isinstance(v, str) and "@" in v
|
|
@@ -49,8 +51,8 @@ def flatten_json_obj(obj, parent_key="", sep="."):
|
|
| 49 |
local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
|
| 50 |
parts = [part for part in local_clean.split() if part]
|
| 51 |
if parts:
|
| 52 |
-
items[new_key + "_name"] = parts[0]
|
| 53 |
-
items[new_key + "_all_names"] = " ".join(parts)
|
| 54 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 55 |
elif isinstance(obj, list):
|
| 56 |
for i, v in enumerate(obj):
|
|
@@ -121,40 +123,6 @@ def ingest_json_files(files):
|
|
| 121 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 122 |
ingest_json_files(uploaded_files)
|
| 123 |
|
| 124 |
-
# --- Improved entity search/filter
|
| 125 |
-
def extract_main_entity(question):
|
| 126 |
-
# crude: get the first capitalized word, or all words
|
| 127 |
-
tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
|
| 128 |
-
keywords = [t.lower() for t in tokens if t.lower() not in {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}]
|
| 129 |
-
# e.g. ["johnny", "spend"] → "johnny"
|
| 130 |
-
return keywords[0] if keywords else None
|
| 131 |
-
|
| 132 |
-
def filter_records_by_entity(records, entity):
|
| 133 |
-
matches = []
|
| 134 |
-
for doc in records:
|
| 135 |
-
if entity and entity in doc.page_content.lower():
|
| 136 |
-
matches.append(doc)
|
| 137 |
-
return matches if matches else records
|
| 138 |
-
|
| 139 |
-
def hybrid_query(user_query, top_k=5):
|
| 140 |
-
vector_docs = query_vector_db(user_query, top_k=top_k)
|
| 141 |
-
fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
|
| 142 |
-
all_docs = []
|
| 143 |
-
seen_ids = set()
|
| 144 |
-
for doc in (vector_docs + fuzzy_docs):
|
| 145 |
-
doc_id = doc.metadata.get("id")
|
| 146 |
-
if doc_id not in seen_ids:
|
| 147 |
-
all_docs.append(doc)
|
| 148 |
-
seen_ids.add(doc_id)
|
| 149 |
-
# Filter for entity match if possible
|
| 150 |
-
entity = extract_main_entity(user_query)
|
| 151 |
-
entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
|
| 152 |
-
# Optionally, highlight the entity in the flat_text for the LLM
|
| 153 |
-
for doc in entity_docs:
|
| 154 |
-
if entity:
|
| 155 |
-
doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
|
| 156 |
-
return entity_docs[:top_k]
|
| 157 |
-
|
| 158 |
def query_vector_db(user_query, top_k=5):
|
| 159 |
query_emb = get_embedding(user_query)
|
| 160 |
conn = sqlite3.connect(DB_PATH)
|
|
@@ -205,6 +173,42 @@ def python_fuzzy_match(user_query, top_k=5):
|
|
| 205 |
docs.append(Document(page_content=row[4], metadata=meta))
|
| 206 |
return docs
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
class HybridRetriever(BaseRetriever):
|
| 209 |
top_k: int = Field(default=5)
|
| 210 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
|
@@ -213,8 +217,10 @@ class HybridRetriever(BaseRetriever):
|
|
| 213 |
# --- Prompt (explicitly tells LLM what to do)
|
| 214 |
system_prompt = (
|
| 215 |
"You are a JSON data assistant. "
|
| 216 |
-
"If
|
| 217 |
-
"
|
|
|
|
|
|
|
| 218 |
"Never make up data. Never ask for clarification."
|
| 219 |
)
|
| 220 |
prompt = ChatPromptTemplate.from_messages([
|
|
|
|
| 36 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 37 |
)
|
| 38 |
|
| 39 |
+
# --- Enhanced Flattening: extract names from emails/user fields for LLM context
|
| 40 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 41 |
items = {}
|
| 42 |
if isinstance(obj, dict):
|
| 43 |
for k, v in obj.items():
|
| 44 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
| 45 |
+
# If this is a customer/email field, extract name!
|
| 46 |
if (
|
| 47 |
k.lower() in {"customer", "user", "email", "username"} and
|
| 48 |
isinstance(v, str) and "@" in v
|
|
|
|
| 51 |
local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
|
| 52 |
parts = [part for part in local_clean.split() if part]
|
| 53 |
if parts:
|
| 54 |
+
items[new_key + "_name"] = parts[0].lower()
|
| 55 |
+
items[new_key + "_all_names"] = " ".join(parts).lower()
|
| 56 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 57 |
elif isinstance(obj, list):
|
| 58 |
for i, v in enumerate(obj):
|
|
|
|
| 123 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 124 |
ingest_json_files(uploaded_files)
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
def query_vector_db(user_query, top_k=5):
|
| 127 |
query_emb = get_embedding(user_query)
|
| 128 |
conn = sqlite3.connect(DB_PATH)
|
|
|
|
| 173 |
docs.append(Document(page_content=row[4], metadata=meta))
|
| 174 |
return docs
|
| 175 |
|
| 176 |
+
def extract_main_entity(question):
|
| 177 |
+
tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
|
| 178 |
+
keywords = [t.lower() for t in tokens if t.lower() not in {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}]
|
| 179 |
+
return keywords[0] if keywords else None
|
| 180 |
+
|
| 181 |
+
def filter_records_by_entity(records, entity):
    """Return the subset of *records* whose text mentions *entity*.

    Matching is case-insensitive: both the entity and each document's
    ``page_content`` are lowercased before the substring test. If
    *entity* is falsy or nothing matches, the full record list is
    returned unchanged so downstream retrieval still has context.

    Parameters
    ----------
    records : list
        Documents exposing a ``page_content`` string attribute.
    entity : str | None
        Keyword to search for (any casing).
    """
    if not entity:
        return records
    # Bug fix: the original compared the raw entity against lowercased
    # content, so a capitalized entity (e.g. "Johnny") could never match.
    needle = entity.lower()
    matches = [doc for doc in records if needle in doc.page_content.lower()]
    return matches if matches else records
|
| 187 |
+
|
| 188 |
+
def hybrid_query(user_query, top_k=5):
    """Retrieve the single most relevant record for *user_query*.

    Combines vector-similarity and fuzzy keyword retrieval, de-duplicates
    the candidates by metadata id, then narrows to records mentioning the
    question's main entity. The chosen record is echoed to the Streamlit
    UI (with the entity bold-highlighted) and returned as a one-element
    list for the LLM chain.

    Parameters
    ----------
    user_query : str
        Natural-language question from the user.
    top_k : int
        How many candidates to pull from each underlying retriever.
    """
    vector_docs = query_vector_db(user_query, top_k=top_k)
    fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)

    # De-duplicate by metadata id. Docs WITHOUT an id are always kept:
    # the previous version collapsed every id-less doc onto the shared
    # None key, silently dropping all but the first of them.
    all_docs = []
    seen_ids = set()
    for doc in vector_docs + fuzzy_docs:
        doc_id = doc.metadata.get("id")
        if doc_id is None or doc_id not in seen_ids:
            all_docs.append(doc)
            if doc_id is not None:
                seen_ids.add(doc_id)

    entity = extract_main_entity(user_query)
    entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs

    if not entity_docs:
        return all_docs[:1]

    # Show only the most relevant record (keeps the LLM's context small).
    doc = entity_docs[0]
    content = doc.page_content
    if entity:
        # Highlight in a COPY rather than mutating doc.page_content in
        # place: the old code permanently rewrote cached/stored documents
        # with markdown markers on every query.
        content = re.sub(rf"({re.escape(entity)})", r"**\1**", content,
                         flags=re.IGNORECASE)
        doc = Document(page_content=content, metadata=doc.metadata)
    st.markdown("#### Context shown to LLM")
    st.code(content)
    return [doc]
|
| 211 |
+
|
| 212 |
class HybridRetriever(BaseRetriever):
|
| 213 |
top_k: int = Field(default=5)
|
| 214 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
|
|
|
| 217 |
# --- Prompt (explicitly tells the LLM exactly how to use the retrieved record)
# Note: adjacent string literals concatenate with NO implicit whitespace, so
# each sentence must carry its own trailing space. The original was missing
# them, producing "…that record.If you can't…" and "…information.'Never…".
system_prompt = (
    "You are a JSON data assistant. "
    "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
    "and answer directly using the record's fields. "
    "For example, if 'customer: johnny.appleseed@gmail.com' and the question is about Johnny, you should use that record. "
    "If you can't find the answer, reply: 'I don’t have that information.' "
    "Never make up data. Never ask for clarification."
)
|
| 226 |
prompt = ChatPromptTemplate.from_messages([
|