Seth0330 commited on
Commit
7dee79b
·
verified ·
1 Parent(s): 2dd2c85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -189
app.py CHANGED
@@ -1,23 +1,25 @@
1
- import os
2
  import streamlit as st
3
- import pandas as pd
4
- import openai
5
- import sqlite3
6
  import json
 
 
 
7
  import numpy as np
8
  import datetime
9
- import re
10
- from langchain.chains import RetrievalQA
11
  from langchain.schema import Document
12
- from langchain_core.retrievers import BaseRetriever
13
- from pydantic import Field
14
- from langchain_openai import ChatOpenAI
15
- from langchain.prompts import ChatPromptTemplate
16
 
17
- DB_PATH = "json_vector.db"
18
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
19
  EMBEDDING_MODEL = "text-embedding-ada-002"
 
20
 
 
 
 
21
  if "ingested_batches" not in st.session_state:
22
  st.session_state.ingested_batches = 0
23
  if "messages" not in st.session_state:
@@ -26,29 +28,18 @@ if "json_links" not in st.session_state:
26
  st.session_state.json_links = []
27
  if "json_link_details" not in st.session_state:
28
  st.session_state.json_link_details = {}
 
 
 
 
29
 
30
- st.set_page_config(page_title="Chat with Your JSON Vectors (Hybrid, Clean)", layout="wide")
31
- st.title("Chat with Your Vectorized JSON Files")
32
-
33
- uploaded_files = st.file_uploader(
34
- "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
35
- )
36
-
37
  def flatten_json_obj(obj, parent_key="", sep="."):
 
38
  items = {}
39
  if isinstance(obj, dict):
40
  for k, v in obj.items():
41
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
42
- if (
43
- k.lower() in {"customer", "user", "email", "username"} and
44
- isinstance(v, str) and "@" in v
45
- ):
46
- local = v.split("@")[0]
47
- local_clean = re.sub(r'[^a-zA-Z0-9]', ' ', local)
48
- parts = [part for part in local_clean.split() if part]
49
- if parts:
50
- items[new_key + "_name"] = parts[0].lower()
51
- items[new_key + "_all_names"] = " ".join(parts).lower()
52
  items.update(flatten_json_obj(v, new_key, sep=sep))
53
  elif isinstance(obj, list):
54
  for i, v in enumerate(obj):
@@ -58,218 +49,179 @@ def flatten_json_obj(obj, parent_key="", sep="."):
58
  items[parent_key] = obj
59
  return items
60
 
 
61
  def get_embedding(text):
62
- client = openai.OpenAI(api_key=OPENAI_API_KEY)
63
- response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
64
- return response.data[0].embedding
65
 
 
66
  def ensure_table():
67
- conn = sqlite3.connect(DB_PATH)
68
- cursor = conn.cursor()
69
- cursor.execute("""
70
- CREATE TABLE IF NOT EXISTS json_records (
71
- id INTEGER PRIMARY KEY AUTOINCREMENT,
72
- batch_time TEXT,
73
- source_file TEXT,
74
- raw_json TEXT,
75
- flat_text TEXT,
76
- embedding BLOB
77
- )
78
- """)
79
- conn.commit()
80
- conn.close()
81
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def ingest_json_files(files):
83
  ensure_table()
84
  rows = []
85
  batch_time = datetime.datetime.utcnow().isoformat()
86
  for file in files:
87
- file.seek(0)
88
  raw = json.load(file)
89
  source_name = file.name
90
- records = raw if isinstance(raw, list) else [raw]
 
 
 
 
 
 
 
 
 
 
 
91
  for rec in records:
92
  flat = flatten_json_obj(rec)
 
 
 
 
 
93
  flat_text = "; ".join([f"{k}: {v}" for k, v in flat.items()])
94
- rows.append((batch_time, source_name, json.dumps(rec), flat_text))
95
- if not rows:
96
- st.warning("No records found in uploaded files!")
97
- return
98
- df = pd.DataFrame(rows, columns=["batch_time", "source_file", "raw_json", "flat_text"])
99
- st.write(f"Flattened {len(df)} records. Generating embeddings (this may take time, please wait)...")
100
  df["embedding"] = df["flat_text"].apply(get_embedding)
101
- conn = sqlite3.connect(DB_PATH)
102
- cursor = conn.cursor()
103
- for _, row in df.iterrows():
104
- emb_bytes = np.array(row.embedding, dtype=np.float32).tobytes()
105
- cursor.execute("""
106
- INSERT INTO json_records (batch_time, source_file, raw_json, flat_text, embedding)
107
- VALUES (?, ?, ?, ?, ?)
108
- """, (row.batch_time, row.source_file, row.raw_json, row.flat_text, emb_bytes))
109
- conn.commit()
110
- conn.close()
111
  st.success(f"Ingested and indexed {len(df)} new records!")
112
  st.session_state.ingested_batches += 1
113
 
114
- if uploaded_files and st.button("Ingest batch to database"):
115
- ingest_json_files(uploaded_files)
116
-
117
  def query_vector_db(user_query, top_k=5):
118
- query_emb = get_embedding(user_query)
119
- conn = sqlite3.connect(DB_PATH)
120
- cursor = conn.cursor()
121
- cursor.execute("SELECT id, batch_time, source_file, raw_json, flat_text, embedding FROM json_records")
122
  results = []
123
- for row in cursor.fetchall():
124
  db_emb = np.frombuffer(row[5], dtype=np.float32)
125
- if len(db_emb) != len(query_emb): continue
126
- sim = np.dot(query_emb, db_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(db_emb))
 
127
  results.append((sim, row))
128
- conn.close()
129
- results = sorted(results, reverse=True)[:top_k]
130
  docs = []
131
  for sim, row in results:
132
  meta = {
133
  "id": row[0],
134
- "batch_time": str(row[1]),
135
  "source_file": row[2],
136
- "similarity": f"{sim:.4f} (embedding)",
137
  "raw_json": row[3],
138
  }
139
  docs.append(Document(page_content=row[4], metadata=meta))
140
  return docs
141
 
142
- def python_fuzzy_match(user_query, top_k=5):
143
- query_terms = set(user_query.lower().replace("@", " ").replace(".", " ").split())
144
- conn = sqlite3.connect(DB_PATH)
145
- cursor = conn.cursor()
146
- cursor.execute("SELECT id, batch_time, source_file, raw_json, flat_text FROM json_records")
147
- results = []
148
- for row in cursor.fetchall():
149
- flat_text = row[4].lower()
150
- score = sum(any(term in flat_text for term in query_terms) for term in query_terms)
151
- if score > 0:
152
- results.append((score, row))
153
- conn.close()
154
- results = sorted(results, reverse=True)[:top_k]
155
- docs = []
156
- for score, row in results:
157
- meta = {
158
- "id": row[0],
159
- "batch_time": str(row[1]),
160
- "source_file": row[2],
161
- "similarity": f"{score} (fuzzy)",
162
- "raw_json": row[3],
163
- }
164
- docs.append(Document(page_content=row[4], metadata=meta))
165
- return docs
166
-
167
- def extract_main_entity(question):
168
- import re
169
- quoted = re.findall(r"['\"]([^'\"]+)['\"]", question)
170
- if quoted:
171
- return quoted[0].lower()
172
- email = re.findall(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", question)
173
- if email:
174
- return email[0].lower().split('@')[0]
175
- tokens = re.findall(r"\b([A-Za-z0-9]+)\b", question)
176
- stopwords = {"how", "much", "did", "spend", "was", "the", "is", "in", "on", "for", "a", "an", "of", "to", "with"}
177
- keywords = [t.lower() for t in tokens if t.lower() not in stopwords]
178
- if not keywords:
179
- return ""
180
- return max(keywords, key=len)
181
-
182
- def filter_records_by_entity(records, entity):
183
- if not entity:
184
- return records
185
- matches = []
186
- for doc in records:
187
- if entity in doc.page_content.lower():
188
- matches.append(doc)
189
- elif any(entity in v.lower() for v in doc.page_content.split(';')):
190
- matches.append(doc)
191
- return matches if matches else records
192
 
193
- def hybrid_query(user_query, top_k=5):
194
- vector_docs = query_vector_db(user_query, top_k=top_k)
195
- fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
196
- all_docs = []
197
- seen_ids = set()
198
- for doc in (vector_docs + fuzzy_docs):
199
- doc_id = doc.metadata.get("id")
200
- if doc_id not in seen_ids:
201
- all_docs.append(doc)
202
- seen_ids.add(doc_id)
203
- entity = extract_main_entity(user_query)
204
- entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
205
- if entity_docs:
206
- doc = entity_docs[0]
207
- return [doc]
208
- else:
209
- return all_docs[:1]
210
-
211
- class HybridRetriever(BaseRetriever):
212
- top_k: int = Field(default=5)
213
- def _get_relevant_documents(self, query, run_manager=None, **kwargs):
214
- return hybrid_query(query, self.top_k)
215
-
216
- system_prompt = (
217
- "You are a JSON data assistant. "
218
- "If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
219
- "and answer directly using the record's fields. "
220
- "For example, if 'customer: johnny.appleseed@gmail.com' and the question is about Johnny, you should use that record."
221
- "If you can't find the answer, reply: 'I don’t have that information.'"
222
- "Never make up data. Never ask for clarification."
223
- )
224
- prompt = ChatPromptTemplate.from_messages([
225
- ("system", system_prompt),
226
- ("human", "Here are the most relevant records:\n{context}\n\nQuestion: {question}")
227
- ])
228
-
229
- llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
230
- retriever = HybridRetriever(top_k=5)
231
  qa_chain = RetrievalQA.from_chain_type(
232
  llm=llm,
233
  retriever=retriever,
234
- chain_type_kwargs={"prompt": prompt},
235
  return_source_documents=True,
 
236
  )
237
 
238
- st.markdown("### Ask any question about your data, just like ChatGPT.")
 
 
 
 
 
 
239
 
240
- def show_tiny_json_links():
241
- # Only show for the last assistant answer if there are matching JSONs
242
- if not st.session_state.json_links:
243
- return
244
- for idx, link_key in enumerate(st.session_state.json_links):
245
- label = st.session_state.json_link_details[link_key]['label']
246
- rec = st.session_state.json_link_details[link_key]['record']
247
- expander_label = f"<span style='font-size:11px; color:#444; text-decoration:underline;'>[view JSON]</span> <span style='font-size:10px; color:#aaa'>{label}</span>"
248
- with st.expander(label="", expanded=False):
249
- st.markdown(expander_label, unsafe_allow_html=True)
250
- st.code(json.dumps(rec, indent=2), language="json")
251
- st.session_state.json_links = []
252
- st.session_state.json_link_details = {}
253
 
254
- for msg in st.session_state.messages:
255
- if msg["role"] == "user":
256
- st.markdown(f"<div style='color: #4F8BF9;'><b>User:</b> {msg['content']}</div>", unsafe_allow_html=True)
257
- elif msg["role"] == "assistant":
258
- st.markdown(f"<div style='color: #1C6E4C;'><b>Agent:</b> {msg['content']}</div>", unsafe_allow_html=True)
259
- show_tiny_json_links()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  def send_message():
262
  user_input = st.session_state.temp_input.strip()
263
  if not user_input:
264
  return
 
 
 
 
 
 
265
  st.session_state.messages.append({"role": "user", "content": user_input})
266
  with st.spinner("Thinking..."):
267
- result = qa_chain({"query": user_input})
268
  answer = result['result']
269
  st.session_state.messages.append({"role": "assistant", "content": answer})
270
  docs = result['source_documents']
271
  link_keys = []
272
  link_details = {}
 
 
273
  for idx, doc in enumerate(docs):
274
  link_key = f"json_{doc.metadata['id']}_{idx}"
275
  rec = json.loads(doc.metadata["raw_json"])
@@ -278,13 +230,26 @@ def send_message():
278
  link_keys.append(link_key)
279
  st.session_state.json_links = link_keys
280
  st.session_state.json_link_details = link_details
 
281
  st.session_state.temp_input = ""
282
 
283
- st.text_input("Your message:", key="temp_input", on_change=send_message)
 
 
 
 
 
 
 
 
 
284
 
 
285
  if st.button("Clear chat"):
286
  st.session_state.messages = []
287
  st.session_state.json_links = []
288
  st.session_state.json_link_details = {}
 
 
289
 
290
  st.info(f"Batches ingested so far (this session): {st.session_state.ingested_batches}")
 
 
1
  import streamlit as st
2
+ import os
 
 
3
  import json
4
+ import re
5
+ import sqlite3
6
+ import pandas as pd
7
  import numpy as np
8
  import datetime
9
+ from typing import List, Dict
10
+ import openai
11
  from langchain.schema import Document
12
+ from langchain.chains import RetrievalQA
13
+ from langchain_community.llms import OpenAI as LangOpenAI
 
 
14
 
15
# ---- CONFIG ----
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # read from the environment; None if unset
EMBEDDING_MODEL = "text-embedding-ada-002"    # OpenAI model used for every stored vector
DB_FILE = "json_vector_store.db"              # local SQLite file acting as the vector store

st.set_page_config(page_title="Chat with Your Vectorized JSON Files", layout="wide")
21
+
22
+ # --- Session State ---
23
  if "ingested_batches" not in st.session_state:
24
  st.session_state.ingested_batches = 0
25
  if "messages" not in st.session_state:
 
28
  st.session_state.json_links = []
29
  if "json_link_details" not in st.session_state:
30
  st.session_state.json_link_details = {}
31
+ if "modal_link" not in st.session_state:
32
+ st.session_state.modal_link = None
33
+ if "last_entity" not in st.session_state:
34
+ st.session_state.last_entity = None
35
 
36
+ # ---- Helper: Flatten JSON ----
 
 
 
 
 
 
37
  def flatten_json_obj(obj, parent_key="", sep="."):
38
+ """Flatten nested JSON objects/lists with dot notation."""
39
  items = {}
40
  if isinstance(obj, dict):
41
  for k, v in obj.items():
42
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
 
 
 
 
 
 
 
 
 
 
43
  items.update(flatten_json_obj(v, new_key, sep=sep))
44
  elif isinstance(obj, list):
45
  for i, v in enumerate(obj):
 
49
  items[parent_key] = obj
50
  return items
51
 
52
+ # ---- Helper: Get OpenAI Embedding ----
53
def get_embedding(text):
    """Return the embedding vector (list of floats) for *text*.

    Uses a per-call client instead of assigning ``openai.api_key`` at module
    level: mutating the SDK's global key is shared state and not safe if this
    module is ever used from multiple threads (Streamlit re-runs scripts
    concurrently per session).
    """
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    resp = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
    return resp.data[0].embedding
57
 
58
+ # ---- SQLite DB Setup ----
59
def ensure_table():
    """Create the ``json_records`` table if it does not already exist.

    Note: ``with sqlite3.connect(...)`` only scopes the *transaction*
    (commit/rollback on exit) -- it does NOT close the connection, so the
    original leaked one connection per call. We close explicitly instead.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS json_records (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                batch_time TEXT,
                source_file TEXT,
                raw_json TEXT,
                flat_text TEXT,
                embedding BLOB
            )
        """)
        conn.commit()
    finally:
        conn.close()
73
+
74
def insert_records(records):
    """Bulk-insert pre-built rows into ``json_records``.

    Each element of *records* is a 5-tuple:
    ``(batch_time, source_file, raw_json, flat_text, embedding_blob)``.

    The connection is closed explicitly: sqlite3's ``with`` block commits on
    success but never closes, which leaked a connection per batch.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        conn.executemany(
            "INSERT INTO json_records (batch_time, source_file, raw_json, flat_text, embedding) VALUES (?, ?, ?, ?, ?)",
            records
        )
        conn.commit()
    finally:
        conn.close()
82
+
83
def all_records():
    """Return every stored row as tuples
    ``(id, batch_time, source_file, raw_json, flat_text, embedding)``.

    Closes the connection explicitly (sqlite3's context manager does not),
    so repeated queries don't accumulate open connections.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        cur = conn.execute(
            "SELECT id, batch_time, source_file, raw_json, flat_text, embedding FROM json_records"
        )
        return cur.fetchall()
    finally:
        conn.close()
88
+
89
+ # ---- Ingest JSON Batch ----
90
def ingest_json_files(files):
    """Flatten, embed, and store a batch of uploaded JSON files.

    *files* are Streamlit ``UploadedFile`` objects. Top-level structure is
    normalized: a list becomes one record per element; a dict wrapping a
    single main list uses that list; anything else is a single record.
    """
    ensure_table()
    rows = []
    batch_time = datetime.datetime.utcnow().isoformat()
    for file in files:
        # Streamlit may already have consumed the buffer on a previous re-run.
        file.seek(0)
        raw = json.load(file)
        source_name = file.name
        # Normalize the top level to a list of records.
        if isinstance(raw, list):
            records = raw
        elif isinstance(raw, dict):
            # If the dict wraps one main list, treat its elements as records.
            main_lists = [v for v in raw.values() if isinstance(v, list)]
            records = main_lists[0] if main_lists else [raw]
        else:
            records = [raw]
        for rec in records:
            flat = flatten_json_obj(rec)
            # Heuristic: expose a searchable name derived from a customer email.
            # Guard on dict -- records taken from a top-level list may be
            # strings/lists, where `"customer" in rec` would misbehave or raise.
            if isinstance(rec, dict) and isinstance(rec.get("customer"), str):
                first_name = rec["customer"].split("@")[0].replace(".", " ")
                flat["customer_name"] = first_name
                flat["customer_all_names"] = first_name  # dots already replaced above
            flat_text = "; ".join(f"{k}: {v}" for k, v in flat.items())
            rows.append((batch_time, source_name, json.dumps(rec), flat_text, None))
    if not rows:
        # Nothing to embed -- avoid building an empty DataFrame and inserting nothing.
        st.warning("No records found in uploaded files!")
        return
    df = pd.DataFrame(rows, columns=["batch_time", "source_file", "raw_json", "flat_text", "embedding"])
    st.write(f"Flattened {len(df)} records. Generating embeddings...")
    df["embedding"] = df["flat_text"].apply(get_embedding)
    sql_rows = [
        (
            row.batch_time, row.source_file, row.raw_json, row.flat_text,
            sqlite3.Binary(np.array(row.embedding, dtype=np.float32).tobytes())
        )
        for _, row in df.iterrows()
    ]
    insert_records(sql_rows)
    st.success(f"Ingested and indexed {len(df)} new records!")
    st.session_state.ingested_batches += 1
131
 
132
+ # ---- Hybrid Retrieval ----
 
 
133
def query_vector_db(user_query, top_k=5):
    """Embed *user_query* and return the *top_k* most similar stored records
    as LangChain ``Document``s, ranked by cosine similarity.

    Rows whose stored embedding has a different dimensionality (e.g. ingested
    with another model) are skipped, as are degenerate zero-norm vectors,
    which would otherwise divide by zero and poison the ranking with NaN.
    """
    query_emb = np.array(get_embedding(user_query), dtype=np.float32)
    query_norm = np.linalg.norm(query_emb)
    results = []
    for row in all_records():
        db_emb = np.frombuffer(row[5], dtype=np.float32)
        if len(db_emb) != len(query_emb):
            continue  # incompatible embedding dimension
        denom = query_norm * np.linalg.norm(db_emb)
        if denom == 0:
            continue  # zero-norm embedding: cosine similarity undefined
        sim = float(np.dot(query_emb, db_emb) / denom)
        results.append((sim, row))
    # Rank by similarity only; never compare the row tuples themselves.
    results.sort(key=lambda pair: pair[0], reverse=True)
    docs = []
    for sim, row in results[:top_k]:
        meta = {
            "id": row[0],
            "batch_time": row[1],
            "source_file": row[2],
            "similarity": f"{sim:.4f}",
            "raw_json": row[3],
        }
        docs.append(Document(page_content=row[4], metadata=meta))
    return docs
155
 
156
+ # ---- LangChain Retriever Adapter ----
157
class SQLiteVectorRetriever:
    """Minimal retriever adapter over the SQLite vector store.

    NOTE(review): this is duck-typed, not a ``langchain_core`` BaseRetriever
    subclass -- newer LangChain versions validate the retriever type on
    ``RetrievalQA``; confirm this object is accepted.
    """

    def __init__(self, top_k=5):
        # Number of documents returned per query; was hard-coded to 5.
        self.top_k = top_k

    def get_relevant_documents(self, query):
        """Return the ``top_k`` most similar Documents for *query*."""
        return query_vector_db(query, top_k=self.top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
# ---- LangChain LLM & QA Chain ----
# NOTE(review): LangOpenAI is the legacy *completions* wrapper while
# "gpt-4.1" is a chat model -- confirm this resolves at runtime; the old
# version of this file used ChatOpenAI instead.
llm = LangOpenAI(model_name="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
retriever = SQLiteVectorRetriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    # NOTE(review): chain_type_kwargs is forwarded to the combine-documents
    # chain constructor -- verify "input_key" is a valid kwarg there; it does
    # not select the chain type.
    chain_type_kwargs={"input_key": "query"}
)
170
 
171
# ---- Ingestion UI ----
st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM)")
uploaded_files = st.file_uploader(
    "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
)
# Ingest only on an explicit click so Streamlit re-runs don't re-index the same batch.
if uploaded_files and st.button("Ingest batch to database"):
    ingest_json_files(uploaded_files)

# ---- Conversation UI ----
st.markdown("### Ask any question about your data, just like ChatGPT.")
 
 
 
 
 
 
 
 
 
 
 
181
 
182
def update_last_entity(doc):
    """Remember the entity (customer email or name) of the best-matching
    record so follow-up questions using pronouns can be resolved against it.

    The original swallowed *every* exception with ``pass``, which also hid a
    ``TypeError`` from ``"@" in rec["customer"]`` when the field is not a
    string; we guard the type explicitly and only catch decode errors.
    """
    try:
        rec = json.loads(doc.metadata["raw_json"])
    except (KeyError, TypeError, ValueError):
        return  # no usable raw_json on this document
    if not isinstance(rec, dict):
        return
    customer = rec.get("customer")
    if isinstance(customer, str) and "@" in customer:
        st.session_state.last_entity = customer
    elif "customer_name" in rec:
        st.session_state.last_entity = rec["customer_name"]
191
+
192
def render_json_links():
    """Render tiny ``[view JSON]`` buttons for the last answer's source
    records; clicking one shows that record's JSON in place.

    Removed the unused ``rec`` local -- the record is only read when a link
    has actually been selected (``modal_link``).
    """
    for key in st.session_state.json_links:
        info = st.session_state.json_link_details[key]
        if st.button(f"[view JSON] {info['label']}", key=key, help="Show JSON record", use_container_width=False):
            st.session_state.modal_link = key
    if st.session_state.modal_link:
        selected = st.session_state.json_link_details[st.session_state.modal_link]
        with st.container():
            st.code(json.dumps(selected["record"], indent=2), language="json")
204
 
205
  def send_message():
206
  user_input = st.session_state.temp_input.strip()
207
  if not user_input:
208
  return
209
+ # Entity resolution for pronouns (he, his, etc.)
210
+ pronoun = re.search(r"\b(he|his|him|her|she|their)\b", user_input, re.I)
211
+ if st.session_state.last_entity and pronoun:
212
+ q = f"For {st.session_state.last_entity}: {user_input}"
213
+ else:
214
+ q = user_input
215
  st.session_state.messages.append({"role": "user", "content": user_input})
216
  with st.spinner("Thinking..."):
217
+ result = qa_chain({"query": q})
218
  answer = result['result']
219
  st.session_state.messages.append({"role": "assistant", "content": answer})
220
  docs = result['source_documents']
221
  link_keys = []
222
  link_details = {}
223
+ if docs:
224
+ update_last_entity(docs[0])
225
  for idx, doc in enumerate(docs):
226
  link_key = f"json_{doc.metadata['id']}_{idx}"
227
  rec = json.loads(doc.metadata["raw_json"])
 
230
  link_keys.append(link_key)
231
  st.session_state.json_links = link_keys
232
  st.session_state.json_link_details = link_details
233
+ st.session_state.modal_link = None # reset on every new message
234
  st.session_state.temp_input = ""
235
 
236
# ---- Chat Conversation Rendering ----
# Replay the full transcript on every Streamlit re-run.
for msg in st.session_state.messages:
    if msg["role"] == "user":
        st.markdown(f"<b style='color:#3575dd'>User:</b> <span style='color:#111'>{msg['content']}</span>", unsafe_allow_html=True)
    elif msg["role"] == "assistant":
        st.markdown(f"<b style='color:#1c6e4c'>Agent:</b> <span style='color:#111'>{msg['content']}</span>", unsafe_allow_html=True)

# Source-record links for the most recent answer, if any.
if st.session_state.json_links:
    st.markdown("<b>Function Output:</b>", unsafe_allow_html=True)
    render_json_links()
246
 
247
st.text_input("Your message:", key="temp_input", on_change=send_message)
if st.button("Clear chat"):
    # Reset the whole conversation state, including the entity memory
    # used for pronoun resolution and any open JSON viewer.
    st.session_state.messages = []
    st.session_state.json_links = []
    st.session_state.json_link_details = {}
    st.session_state.modal_link = None
    st.session_state.last_entity = None

st.info(f"Batches ingested so far (this session): {st.session_state.ingested_batches}")