Update app.py
Browse files
app.py
CHANGED
|
@@ -36,11 +36,13 @@ uploaded_files = st.file_uploader(
|
|
| 36 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 37 |
)
|
| 38 |
|
|
|
|
| 39 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 40 |
items = {}
|
| 41 |
if isinstance(obj, dict):
|
| 42 |
for k, v in obj.items():
|
| 43 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
|
|
|
| 44 |
if (
|
| 45 |
k.lower() in {"customer", "user", "email", "username"} and
|
| 46 |
isinstance(v, str) and "@" in v
|
|
@@ -49,8 +51,8 @@ def flatten_json_obj(obj, parent_key="", sep="."):
|
|
| 49 |
local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
|
| 50 |
parts = [part for part in local_clean.split() if part]
|
| 51 |
if parts:
|
| 52 |
-
items[new_key + "_name"] = parts[0]
|
| 53 |
-
items[new_key + "_all_names"] = " ".join(parts)
|
| 54 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 55 |
elif isinstance(obj, list):
|
| 56 |
for i, v in enumerate(obj):
|
|
@@ -121,40 +123,6 @@ def ingest_json_files(files):
|
|
| 121 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 122 |
ingest_json_files(uploaded_files)
|
| 123 |
|
| 124 |
-
# --- Improved entity search/filter
|
| 125 |
-
def extract_main_entity(question):
|
| 126 |
-
# crude: get the first capitalized word, or all words
|
| 127 |
-
tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
|
| 128 |
-
keywords = [t.lower() for t in tokens if t.lower() not in {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}]
|
| 129 |
-
# e.g. ["johnny", "spend"] → "johnny"
|
| 130 |
-
return keywords[0] if keywords else None
|
| 131 |
-
|
| 132 |
-
def filter_records_by_entity(records, entity):
|
| 133 |
-
matches = []
|
| 134 |
-
for doc in records:
|
| 135 |
-
if entity and entity in doc.page_content.lower():
|
| 136 |
-
matches.append(doc)
|
| 137 |
-
return matches if matches else records
|
| 138 |
-
|
| 139 |
-
def hybrid_query(user_query, top_k=5):
|
| 140 |
-
vector_docs = query_vector_db(user_query, top_k=top_k)
|
| 141 |
-
fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
|
| 142 |
-
all_docs = []
|
| 143 |
-
seen_ids = set()
|
| 144 |
-
for doc in (vector_docs + fuzzy_docs):
|
| 145 |
-
doc_id = doc.metadata.get("id")
|
| 146 |
-
if doc_id not in seen_ids:
|
| 147 |
-
all_docs.append(doc)
|
| 148 |
-
seen_ids.add(doc_id)
|
| 149 |
-
# Filter for entity match if possible
|
| 150 |
-
entity = extract_main_entity(user_query)
|
| 151 |
-
entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
|
| 152 |
-
# Optionally, highlight the entity in the flat_text for the LLM
|
| 153 |
-
for doc in entity_docs:
|
| 154 |
-
if entity:
|
| 155 |
-
doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
|
| 156 |
-
return entity_docs[:top_k]
|
| 157 |
-
|
| 158 |
def query_vector_db(user_query, top_k=5):
|
| 159 |
query_emb = get_embedding(user_query)
|
| 160 |
conn = sqlite3.connect(DB_PATH)
|
|
@@ -205,6 +173,42 @@ def python_fuzzy_match(user_query, top_k=5):
|
|
| 205 |
docs.append(Document(page_content=row[4], metadata=meta))
|
| 206 |
return docs
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
class HybridRetriever(BaseRetriever):
|
| 209 |
top_k: int = Field(default=5)
|
| 210 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
|
@@ -213,8 +217,10 @@ class HybridRetriever(BaseRetriever):
|
|
| 213 |
# --- Prompt (explicitly tells LLM what to do)
|
| 214 |
system_prompt = (
|
| 215 |
"You are a JSON data assistant. "
|
| 216 |
-
"If
|
| 217 |
-
"
|
|
|
|
|
|
|
| 218 |
"Never make up data. Never ask for clarification."
|
| 219 |
)
|
| 220 |
prompt = ChatPromptTemplate.from_messages([
|
|
|
|
| 36 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 37 |
)
|
| 38 |
|
| 39 |
+
# --- Enhanced Flattening: extract names from emails/user fields for LLM context
|
| 40 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 41 |
items = {}
|
| 42 |
if isinstance(obj, dict):
|
| 43 |
for k, v in obj.items():
|
| 44 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
| 45 |
+
# If this is a customer/email field, extract name!
|
| 46 |
if (
|
| 47 |
k.lower() in {"customer", "user", "email", "username"} and
|
| 48 |
isinstance(v, str) and "@" in v
|
|
|
|
| 51 |
local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
|
| 52 |
parts = [part for part in local_clean.split() if part]
|
| 53 |
if parts:
|
| 54 |
+
items[new_key + "_name"] = parts[0].lower()
|
| 55 |
+
items[new_key + "_all_names"] = " ".join(parts).lower()
|
| 56 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 57 |
elif isinstance(obj, list):
|
| 58 |
for i, v in enumerate(obj):
|
|
|
|
| 123 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 124 |
ingest_json_files(uploaded_files)
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
def query_vector_db(user_query, top_k=5):
|
| 127 |
query_emb = get_embedding(user_query)
|
| 128 |
conn = sqlite3.connect(DB_PATH)
|
|
|
|
| 173 |
docs.append(Document(page_content=row[4], metadata=meta))
|
| 174 |
return docs
|
| 175 |
|
| 176 |
+
def extract_main_entity(question):
|
| 177 |
+
tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
|
| 178 |
+
keywords = [t.lower() for t in tokens if t.lower() not in {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}]
|
| 179 |
+
return keywords[0] if keywords else None
|
| 180 |
+
|
| 181 |
+
def filter_records_by_entity(records, entity):
    """Return the subset of *records* whose text mentions *entity*.

    Matching is case-insensitive: both the entity and each document's
    ``page_content`` are lowercased before the substring test. If
    *entity* is falsy or nothing matches, the full record list is
    returned unchanged so downstream retrieval still has context.

    Parameters
    ----------
    records : list
        Documents exposing a ``page_content`` string attribute.
    entity : str | None
        Keyword to search for (any casing).
    """
    if not entity:
        return records
    # Bug fix: the original compared the raw entity against lowercased
    # content, so a capitalized entity (e.g. "Johnny") could never match.
    needle = entity.lower()
    matches = [doc for doc in records if needle in doc.page_content.lower()]
    return matches if matches else records
|
| 187 |
+
|
| 188 |
+
def hybrid_query(user_query, top_k=5):
    """Retrieve the single most relevant record for *user_query*.

    Combines vector-similarity and fuzzy keyword retrieval, de-duplicates
    the candidates by metadata id, then narrows to records mentioning the
    question's main entity. The chosen record is echoed to the Streamlit
    UI (with the entity bold-highlighted) and returned as a one-element
    list for the LLM chain.

    Parameters
    ----------
    user_query : str
        Natural-language question from the user.
    top_k : int
        How many candidates to pull from each underlying retriever.
    """
    vector_docs = query_vector_db(user_query, top_k=top_k)
    fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)

    # De-duplicate by metadata id. Docs WITHOUT an id are always kept:
    # the previous version collapsed every id-less doc onto the shared
    # None key, silently dropping all but the first of them.
    all_docs = []
    seen_ids = set()
    for doc in vector_docs + fuzzy_docs:
        doc_id = doc.metadata.get("id")
        if doc_id is None or doc_id not in seen_ids:
            all_docs.append(doc)
            if doc_id is not None:
                seen_ids.add(doc_id)

    entity = extract_main_entity(user_query)
    entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs

    if not entity_docs:
        return all_docs[:1]

    # Show only the most relevant record (keeps the LLM's context small).
    doc = entity_docs[0]
    content = doc.page_content
    if entity:
        # Highlight in a COPY rather than mutating doc.page_content in
        # place: the old code permanently rewrote cached/stored documents
        # with markdown markers on every query.
        content = re.sub(rf"({re.escape(entity)})", r"**\1**", content,
                         flags=re.IGNORECASE)
        doc = Document(page_content=content, metadata=doc.metadata)
    st.markdown("#### Context shown to LLM")
    st.code(content)
    return [doc]
|
| 211 |
+
|
| 212 |
class HybridRetriever(BaseRetriever):
|
| 213 |
top_k: int = Field(default=5)
|
| 214 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
|
|
|
| 217 |
# --- Prompt (explicitly tells the LLM exactly how to use the retrieved record)
# Note: adjacent string literals concatenate with NO implicit whitespace, so
# each sentence must carry its own trailing space. The original was missing
# them, producing "…that record.If you can't…" and "…information.'Never…".
system_prompt = (
    "You are a JSON data assistant. "
    "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
    "and answer directly using the record's fields. "
    "For example, if 'customer: johnny.appleseed@gmail.com' and the question is about Johnny, you should use that record. "
    "If you can't find the answer, reply: 'I don’t have that information.' "
    "Never make up data. Never ask for clarification."
)
|
| 226 |
prompt = ChatPromptTemplate.from_messages([
|