Seth0330 commited on
Commit
cf439c3
·
verified ·
1 Parent(s): 71d489c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -418
app.py CHANGED
@@ -1,23 +1,25 @@
1
- import streamlit as st
2
  import os
 
 
 
 
3
  import json
4
- import requests
5
- import traceback
6
- import difflib
7
-
8
- COMMON_FEMALE_NAMES = {
9
- "alice", "mary", "lisa", "jane", "emily", "sophia", "emma", "olivia",
10
- "ava", "mia", "isabella", "charlotte", "amelia", "harper", "abigail"
11
- }
12
-
13
- if "json_data" not in st.session_state:
14
- st.session_state.json_data = {}
15
- if "messages" not in st.session_state:
16
- st.session_state.messages = []
17
- if "files_loaded" not in st.session_state:
18
- st.session_state.files_loaded = False
19
- if "temp_input" not in st.session_state:
20
- st.session_state.temp_input = ""
21
  if "modal_open" not in st.session_state:
22
  st.session_state.modal_open = False
23
  if "modal_content" not in st.session_state:
@@ -25,420 +27,179 @@ if "modal_content" not in st.session_state:
25
  if "modal_title" not in st.session_state:
26
  st.session_state.modal_title = ""
27
 
28
- st.set_page_config(page_title="Chat with Your JSONs", layout="wide")
29
- st.title("Chat with Your JSON Files (OpenAI function-calling, No LangChain)")
30
 
31
- uploaded_files = st.sidebar.file_uploader(
32
- "Choose one or more JSON files", type="json", accept_multiple_files=True
33
  )
34
- if uploaded_files and not st.session_state.files_loaded:
35
- st.session_state.json_data.clear()
36
- for f in uploaded_files:
37
- try:
38
- content = json.load(f)
39
- st.session_state.json_data[f.name] = content
40
- st.sidebar.success(f"Loaded: {f.name}")
41
- except Exception as e:
42
- st.sidebar.error(f"Error reading {f.name}: {e}")
43
- st.session_state.files_loaded = True
44
- st.session_state.messages = []
45
- elif not uploaded_files:
46
- st.session_state.json_data.clear()
47
- st.session_state.files_loaded = False
48
-
49
- def normalize(s):
50
- return ' '.join(str(s).lower().replace("_", " ").replace("-", " ").replace(".", " ").split())
51
-
52
- def is_fuzzy_match(a, b, threshold=0.7):
53
- ratio = difflib.SequenceMatcher(None, a, b).ratio()
54
- return ratio >= threshold or a in b or b in a
55
-
56
- def search_all_jsons(key, value):
57
- matches = []
58
- value_norm = normalize(value)
59
- for file_name, data in st.session_state.json_data.items():
60
- def recursive_search(obj):
61
- if isinstance(obj, dict):
62
- for k, v in obj.items():
63
- if normalize(k) == normalize(key):
64
- if isinstance(v, (str, int, float, bool)) and is_fuzzy_match(value_norm, normalize(v)):
65
- matches.append({
66
- "file": file_name,
67
- "key": k,
68
- "value": v,
69
- "record": obj
70
- })
71
- recursive_search(v)
72
- elif isinstance(obj, list):
73
- for item in obj:
74
- recursive_search(item)
75
- recursive_search(data)
76
- return matches
77
-
78
- def fuzzy_value_search(value):
79
- matches = []
80
- value_norm = normalize(value)
81
- for file_name, data in st.session_state.json_data.items():
82
- def recursive_search(obj):
83
- if isinstance(obj, dict):
84
- for k, v in obj.items():
85
- if isinstance(v, (str, int, float, bool)) and is_fuzzy_match(value_norm, normalize(v)):
86
- matches.append({
87
- "file": file_name,
88
- "key": k,
89
- "value": v,
90
- "record": obj
91
- })
92
- recursive_search(v)
93
- elif isinstance(obj, list):
94
- for item in obj:
95
- recursive_search(item)
96
- recursive_search(data)
97
- return matches
98
 
99
- def list_keys(file_name):
100
- try:
101
- data = st.session_state.json_data[file_name]
102
- if isinstance(data, dict):
103
- return list(data.keys())
104
- elif isinstance(data, list) and data and isinstance(data[0], dict):
105
- return list(data[0].keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  else:
107
- return []
108
- except Exception as e:
109
- return {"error": str(e)}
110
-
111
- def count_key_occurrences(file_name, key):
112
- try:
113
- data = st.session_state.json_data[file_name]
114
- count = 0
115
- def recursive(obj):
116
- nonlocal count
117
- if isinstance(obj, dict):
118
- for k, v in obj.items():
119
- if normalize(k) == normalize(key):
120
- count += 1
121
- recursive(v)
122
- elif isinstance(obj, list):
123
- for item in obj:
124
- recursive(item)
125
- recursive(data)
126
- return count
127
- except Exception as e:
128
- return {"error": str(e)}
129
-
130
- def find_in_arrays(key, value, return_count=True):
131
- matches = []
132
- count = 0
133
- for file_name, data in st.session_state.json_data.items():
134
- def recursive(obj):
135
- nonlocal count
136
- if isinstance(obj, list):
137
- for item in obj:
138
- if isinstance(item, dict):
139
- for k, v in item.items():
140
- if normalize(k) == normalize(key) and normalize(str(v)) == normalize(str(value)):
141
- matches.append({
142
- "file": file_name,
143
- "item": item,
144
- "array_path": key,
145
- "record": item # show the item dict itself for popup
146
- })
147
- count += 1
148
- recursive(item)
149
- elif isinstance(obj, dict):
150
- for v in obj.values():
151
- recursive(v)
152
- recursive(data)
153
- return count if return_count else matches
154
-
155
- def sum_field_by_name(name, field="amount"):
156
- total = 0
157
- details = []
158
- name_norm = normalize(name)
159
- for file_name, data in st.session_state.json_data.items():
160
- def recursive(obj):
161
- nonlocal total
162
- if isinstance(obj, dict):
163
- for k, v in obj.items():
164
- if isinstance(v, (str, int, float, bool)) and is_fuzzy_match(name_norm, normalize(v)):
165
- if field in obj:
166
- try:
167
- amt = float(obj[field])
168
- total += amt
169
- details.append({"file": file_name, "name_match": v, "amount": amt, "record": obj})
170
- except Exception:
171
- pass
172
- recursive(v)
173
- elif isinstance(obj, list):
174
- for item in obj:
175
- recursive(item)
176
- recursive(data)
177
- return {"total": total, "matches": details}
178
-
179
- def count_female_names():
180
- count = 0
181
- names = []
182
- for file_name, data in st.session_state.json_data.items():
183
- def recursive(obj):
184
- nonlocal count
185
- if isinstance(obj, dict):
186
- for k, v in obj.items():
187
- if k.lower() in {"name", "fullName", "firstName"}:
188
- first_name = str(v).split()[0].lower()
189
- if first_name in COMMON_FEMALE_NAMES:
190
- count += 1
191
- names.append({"file": file_name, "name": v, "record": obj})
192
- recursive(v)
193
- elif isinstance(obj, list):
194
- for item in obj:
195
- recursive(item)
196
- recursive(data)
197
- return {"count": count, "names": names}
198
-
199
- function_schema = [
200
- {
201
- "name": "search_all_jsons",
202
- "description": "Recursively search all uploaded JSONs for all records where a key matches a value (fuzzy, any type).",
203
- "parameters": {
204
- "type": "object",
205
- "properties": {
206
- "key": {"type": "string"},
207
- "value": {"type": "string"}
208
- },
209
- "required": ["key", "value"]
210
- }
211
- },
212
- {
213
- "name": "fuzzy_value_search",
214
- "description": "Search all uploaded JSONs for any record with a field value matching (fuzzy, all types).",
215
- "parameters": {
216
- "type": "object",
217
- "properties": {
218
- "value": {"type": "string"}
219
- },
220
- "required": ["value"]
221
- }
222
- },
223
- {
224
- "name": "list_keys",
225
- "description": "List top-level keys in a given JSON file.",
226
- "parameters": {
227
- "type": "object",
228
- "properties": {
229
- "file_name": {"type": "string"}
230
- },
231
- "required": ["file_name"]
232
  }
233
- },
234
- {
235
- "name": "count_key_occurrences",
236
- "description": "Count number of times a key appears in a file.",
237
- "parameters": {
238
- "type": "object",
239
- "properties": {
240
- "file_name": {"type": "string"},
241
- "key": {"type": "string"}
242
- },
243
- "required": ["file_name", "key"]
244
- }
245
- },
246
- {
247
- "name": "find_in_arrays",
248
- "description": "Find/count all objects in any arrays/lists where key equals value (e.g. done:true for completed tasks).",
249
- "parameters": {
250
- "type": "object",
251
- "properties": {
252
- "key": {"type": "string", "description": "The key to search for, e.g., 'done'"},
253
- "value": {"type": "string", "description": "The value to match, e.g., 'true' or 'false'"},
254
- "return_count": {"type": "boolean", "description": "Return the count (true) or matching records (false)."}
255
- },
256
- "required": ["key", "value"]
257
- }
258
- },
259
- {
260
- "name": "sum_field_by_name",
261
- "description": "Sum a field (e.g. amount) for any record containing a name/email/identifier. Returns total and breakdown.",
262
- "parameters": {
263
- "type": "object",
264
- "properties": {
265
- "name": {"type": "string", "description": "Name or identifier to match"},
266
- "field": {"type": "string", "description": "The numeric field to sum, e.g. 'amount'"},
267
- },
268
- "required": ["name", "field"]
269
- }
270
- },
271
- {
272
- "name": "count_female_names",
273
- "description": "Count the number of common female names based on a preset list.",
274
- "parameters": {
275
- "type": "object",
276
- "properties": {},
277
- }
278
- }
279
- ]
280
-
281
- system_message = {
282
- "role": "system",
283
- "content": (
284
- "You are a JSON data assistant. Use the functions provided to answer the user's question. "
285
- "If the user asks for the number or details of items in a list/array (e.g., completed tasks), use 'find_in_arrays'. "
286
- "If the user asks about the sum/total of a field for a name or identifier, use 'sum_field_by_name'. "
287
- "If the user asks about female names, use 'count_female_names'. "
288
- "If the user's query does not mention a key, use 'fuzzy_value_search' to match on any value. "
289
- "If a key is mentioned (like 'apps_installed'), use 'search_all_jsons' for that key and the value. "
290
- "You may use 'list_keys' to help discover the file structure if needed. "
291
- "Always give a direct answer from the data if possible."
292
- )
293
- }
294
-
295
- st.markdown("### Ask any question about your data, just like ChatGPT.")
296
 
297
- for msg in st.session_state.messages:
298
- if msg["role"] == "user":
299
- st.markdown(f"<div style='color: #4F8BF9;'><b>User:</b> {msg['content']}</div>", unsafe_allow_html=True)
300
- elif msg["role"] == "assistant":
301
- st.markdown(f"<div style='color: #1C6E4C;'><b>Agent:</b> {msg['content']}</div>", unsafe_allow_html=True)
302
- elif msg["role"] == "function":
303
- st.markdown(f"<details><summary><b>Function '{msg['name']}' output:</b></summary><pre>{msg['content']}</pre></details>", unsafe_allow_html=True)
304
 
305
- # --- JSON MODAL POPUP ---
306
  def show_json_links_and_modal():
307
- # Find last function message
308
- for msg in reversed(st.session_state.messages):
309
- if msg.get("role") == "function":
310
- func_name = msg.get("name")
311
- try:
312
- content = json.loads(msg["content"])
313
- except Exception:
314
- content = None
315
- links_shown = False
316
- if isinstance(content, list):
317
- for idx, match in enumerate(content):
318
- if isinstance(match, dict) and "record" in match:
319
- if st.button(f"View JSON: {match.get('file', 'unknown')} record #{idx+1}", key=f"modal_{func_name}_{idx}"):
320
- st.session_state.modal_open = True
321
- st.session_state.modal_content = json.dumps(match["record"], indent=2)
322
- st.session_state.modal_title = f"{match.get('file', 'unknown')} record #{idx+1}"
323
- links_shown = True
324
- elif isinstance(content, dict):
325
- # For dicts with matches
326
- if "matches" in content and isinstance(content["matches"], list):
327
- for idx, match in enumerate(content["matches"]):
328
- if isinstance(match, dict) and "record" in match:
329
- if st.button(f"View JSON: {match.get('file', 'unknown')} record #{idx+1}", key=f"modal_{func_name}_matches_{idx}"):
330
- st.session_state.modal_open = True
331
- st.session_state.modal_content = json.dumps(match["record"], indent=2)
332
- st.session_state.modal_title = f"{match.get('file', 'unknown')} record #{idx+1}"
333
- links_shown = True
334
- if "names" in content and isinstance(content["names"], list):
335
- for idx, match in enumerate(content["names"]):
336
- if isinstance(match, dict) and "record" in match:
337
- if st.button(f"View JSON: {match.get('file', 'unknown')} record #{idx+1}", key=f"modal_{func_name}_names_{idx}"):
338
- st.session_state.modal_open = True
339
- st.session_state.modal_content = json.dumps(match["record"], indent=2)
340
- st.session_state.modal_title = f"{match.get('file', 'unknown')} record #{idx+1}"
341
- links_shown = True
342
- if links_shown:
343
- break
344
-
345
- # Modal popup UI using st.expander as a modal hack
346
  if st.session_state.modal_open:
347
  with st.expander(f"JSON Record: {st.session_state.modal_title}", expanded=True):
348
  st.code(st.session_state.modal_content, language="json")
349
  if st.button("Close", key="close_modal"):
350
  st.session_state.modal_open = False
351
 
352
- show_json_links_and_modal()
353
-
354
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
355
- HEADERS = {
356
- "Authorization": f"Bearer {OPENAI_API_KEY}",
357
- "Content-Type": "application/json",
358
- }
359
-
360
- def send_message():
361
- try:
362
- user_input = st.session_state.temp_input
363
- if user_input.strip():
364
- st.session_state.messages.append({"role": "user", "content": user_input})
365
- chat_messages = [system_message] + st.session_state.messages[-10:]
366
- chat_resp = requests.post(
367
- "https://api.openai.com/v1/chat/completions",
368
- headers=HEADERS,
369
- json={
370
- "model": "gpt-4o",
371
- "messages": chat_messages,
372
- "functions": function_schema,
373
- "function_call": "auto",
374
- "temperature": 0,
375
- "max_tokens": 1200,
376
- },
377
- timeout=60,
378
- )
379
- chat_resp.raise_for_status()
380
- response_json = chat_resp.json()
381
- msg = response_json["choices"][0]["message"]
382
 
383
- if msg.get("function_call"):
384
- func_name = msg["function_call"]["name"]
385
- args_json = msg["function_call"]["arguments"]
386
- args = json.loads(args_json)
387
-
388
- if func_name == "search_all_jsons":
389
- result = search_all_jsons(args.get("key"), args.get("value"))
390
- elif func_name == "fuzzy_value_search":
391
- result = fuzzy_value_search(args.get("value"))
392
- elif func_name == "list_keys":
393
- result = list_keys(args.get("file_name"))
394
- elif func_name == "count_key_occurrences":
395
- result = count_key_occurrences(args.get("file_name"), args.get("key"))
396
- elif func_name == "find_in_arrays":
397
- result = find_in_arrays(
398
- args.get("key"),
399
- args.get("value"),
400
- args.get("return_count", True)
401
- )
402
- elif func_name == "sum_field_by_name":
403
- result = sum_field_by_name(
404
- args.get("name"),
405
- args.get("field", "amount")
406
- )
407
- elif func_name == "count_female_names":
408
- result = count_female_names()
409
- else:
410
- result = {"error": f"Unknown function: {func_name}"}
411
-
412
- st.session_state.messages.append({
413
- "role": "function",
414
- "name": func_name,
415
- "content": json.dumps(result, indent=2),
416
- })
417
 
418
- followup_messages = chat_messages + [
419
- {"role": "function", "name": func_name, "content": json.dumps(result, indent=2)}
420
- ]
421
- final_resp = requests.post(
422
- "https://api.openai.com/v1/chat/completions",
423
- headers=HEADERS,
424
- json={
425
- "model": "gpt-4o",
426
- "messages": followup_messages,
427
- "temperature": 0,
428
- "max_tokens": 1200,
429
- },
430
- timeout=60,
431
- )
432
- final_resp.raise_for_status()
433
- final_json = final_resp.json()
434
- answer = final_json["choices"][0]["message"]["content"]
435
- st.session_state.messages.append({"role": "assistant", "content": answer})
436
- st.session_state.temp_input = ""
437
- except Exception as e:
438
- st.error("Exception: " + str(e))
439
- st.code(traceback.format_exc())
440
 
441
- if st.session_state.json_data:
442
- st.text_input("Your message:", key="temp_input", on_change=send_message)
443
- else:
444
- st.info("Please upload at least one JSON file to start chatting.")
 
 
1
  import os
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import openai
5
+ import pyodbc
6
  import json
7
+ import numpy as np
8
+ import datetime
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.llms import OpenAI
11
+ from langchain.schema import Document
12
+
13
# --- CONFIG ---
# NOTE(review): connection string with credentials is embedded in source —
# move to environment variables or st.secrets before deploying.
AZURE_SQL_CONN_STR = "DRIVER={ODBC Driver 17 for SQL Server};SERVER=<server>.database.windows.net;DATABASE=<db>;UID=<user>;PWD=<password>"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # Or paste your key here
EMBEDDING_MODEL = "text-embedding-ada-002"  # Or your Azure embedding model

# --- Streamlit State Initialization ---
# ingested_batches: number of upload batches persisted during this session.
if "ingested_batches" not in st.session_state:
    st.session_state.ingested_batches = 0
# chat_history: list of (speaker, payload) tuples rendered by the chat UI;
# speaker is "User", "AI", or "AI_DOCS" (retrieved source documents).
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
 
23
  if "modal_open" not in st.session_state:
24
  st.session_state.modal_open = False
25
  if "modal_content" not in st.session_state:
 
27
  if "modal_title" not in st.session_state:
28
  st.session_state.modal_title = ""
29
 
30
# Page chrome must be configured before any other Streamlit UI calls.
st.set_page_config(page_title="Cumulative JSON Vector Search", layout="wide")
st.title("LLM-Powered Analytics: Cumulative JSON Vector DB (Azure SQL)")

# Batch uploader: accepts any JSON structure; records accumulate in the DB
# across uploads (old batches are never deleted).
uploaded_files = st.file_uploader(
    "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
# --- Helper: Flatten any unstructured JSON (handles dict, list, nested, various keys) ---
def flatten_json_obj(obj, parent_key="", sep="."):
    """Flatten an arbitrarily nested JSON value into a single-level dict.

    Dict keys and list indices are joined into dotted paths (e.g. "a.b.0");
    scalars are stored under their accumulated path. A bare scalar with no
    parent key ends up under the empty-string key.
    """
    if isinstance(obj, dict):
        children = (
            (f"{parent_key}{sep}{key}" if parent_key else key, val)
            for key, val in obj.items()
        )
    elif isinstance(obj, list):
        children = (
            (f"{parent_key}{sep}{idx}" if parent_key else str(idx), val)
            for idx, val in enumerate(obj)
        )
    else:
        # Leaf value: record it under the path built so far.
        return {parent_key: obj}
    flat = {}
    for child_key, child_val in children:
        flat.update(flatten_json_obj(child_val, child_key, sep=sep))
    return flat
51
+
52
# --- Embedding function ---
def get_embedding(text):
    """Return the embedding vector (list of floats) for *text* via OpenAI.

    NOTE(review): this uses the pre-1.0 openai SDK surface
    (openai.Embedding.create), which was removed in openai>=1.0 —
    confirm the pinned openai package version.
    """
    # Re-assigning the key per call is harmless but could be done once at startup.
    openai.api_key = OPENAI_API_KEY
    resp = openai.Embedding.create(input=text, model=EMBEDDING_MODEL)
    return resp['data'][0]['embedding']
57
+
58
# --- Ensure DB Table (accumulates all uploads, never deletes old data) ---
def ensure_table():
    """Create the json_records table if it does not exist (idempotent).

    Fix: the connection is now closed in a finally block — previously an
    exception in execute()/commit() leaked the pyodbc connection.
    """
    conn = pyodbc.connect(AZURE_SQL_CONN_STR)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            IF OBJECT_ID('dbo.json_records', 'U') IS NULL
            CREATE TABLE json_records (
                id INT PRIMARY KEY IDENTITY,
                batch_time DATETIME,
                source_file NVARCHAR(255),
                raw_json NVARCHAR(MAX),
                flat_text NVARCHAR(MAX),
                embedding VARBINARY(MAX)
            );
        """)
        conn.commit()
    finally:
        conn.close()
75
+
76
# --- Ingest and accumulate uploaded files ---
def ingest_json_files(files):
    """Flatten, embed, and persist every record from *files* into json_records.

    Each uploaded JSON may be a list of records, a dict wrapping a list
    (e.g. {"people": [...]}), or a single object; all shapes are normalized
    to a list of records. Rows accumulate across batches — nothing is deleted.

    Fixes: a single unparseable file no longer aborts the whole batch (it is
    reported and skipped), and the DB connection is closed even if an insert
    fails.
    """
    ensure_table()
    rows = []
    # Naive UTC timestamp. NOTE(review): datetime.utcnow() is deprecated in
    # Python 3.12+; before switching to datetime.now(timezone.utc), confirm
    # pyodbc/SQL Server DATETIME handling of tz-aware values.
    batch_time = datetime.datetime.utcnow()
    for file in files:
        try:
            raw = json.load(file)
        except ValueError as exc:
            # Skip bad files instead of crashing the entire ingest.
            st.error(f"Skipping {file.name}: invalid JSON ({exc})")
            continue
        source_name = file.name
        # Normalize the top-level shape to a list of records.
        if isinstance(raw, list):
            records = raw
        elif isinstance(raw, dict):
            # If nested records (like {"people": [...]}) use the first list value.
            main_lists = [v for v in raw.values() if isinstance(v, list)]
            records = main_lists[0] if main_lists else [raw]
        else:
            records = [raw]
        for rec in records:
            flat = flatten_json_obj(rec)
            flat_text = "; ".join(f"{k}: {v}" for k, v in flat.items())
            rows.append((batch_time, source_name, json.dumps(rec), flat_text))
    if not rows:
        st.warning("No records found in uploaded files!")
        return
    df = pd.DataFrame(rows, columns=["batch_time", "source_file", "raw_json", "flat_text"])
    st.write(f"Flattened {len(df)} records. Generating embeddings (this may take time, please wait)...")
    df["embedding"] = df["flat_text"].apply(get_embedding)
    # Insert into DB; guarantee the connection is released.
    conn = pyodbc.connect(AZURE_SQL_CONN_STR)
    try:
        cursor = conn.cursor()
        for _, row in df.iterrows():
            # Embeddings are stored as packed float32 bytes in a VARBINARY column.
            emb_bytes = bytearray(np.array(row.embedding, dtype=np.float32).tobytes())
            cursor.execute("""
                INSERT INTO json_records (batch_time, source_file, raw_json, flat_text, embedding)
                VALUES (?, ?, ?, ?, ?)
            """, row.batch_time, row.source_file, row.raw_json, row.flat_text, emb_bytes)
        conn.commit()
    finally:
        conn.close()
    st.success(f"Ingested and indexed {len(df)} new records!")
    st.session_state.ingested_batches += 1
119
+
120
# Ingest only on an explicit button press, so Streamlit re-runs with files
# still attached do not insert duplicate rows.
if uploaded_files and st.button("Ingest batch to database"):
    ingest_json_files(uploaded_files)
122
+
123
# --- Query entire cumulative DB (ALL past and present records) ---
def query_vector_db(user_query, top_k=5):
    """Embed *user_query* and return the top_k most similar rows as Documents.

    Performs a brute-force cosine-similarity scan over every row of
    json_records (acceptable for small tables; a real vector index is needed
    if the table grows large).

    Fixes: sorting now keys on the similarity score only —
    sorted(results, reverse=True) compared (sim, row) tuples and fell back to
    comparing pyodbc Row objects on similarity ties, raising TypeError.
    Also guards against zero-norm vectors and closes the connection in a
    finally block.
    """
    query_vec = np.asarray(get_embedding(user_query), dtype=np.float32)
    query_norm = np.linalg.norm(query_vec)
    conn = pyodbc.connect(AZURE_SQL_CONN_STR)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, batch_time, source_file, raw_json, flat_text, embedding FROM json_records")
        results = []
        for row in cursor.fetchall():
            db_emb = np.frombuffer(row.embedding, dtype=np.float32)
            if len(db_emb) != len(query_vec):
                continue  # Skip malformed / dimension-mismatched embeddings
            denom = query_norm * np.linalg.norm(db_emb)
            if denom == 0:
                continue  # Avoid division by zero on degenerate vectors
            sim = float(np.dot(query_vec, db_emb) / denom)
            results.append((sim, row))
    finally:
        conn.close()
    results.sort(key=lambda pair: pair[0], reverse=True)
    docs = []
    for sim, row in results[:top_k]:
        meta = {
            "id": row.id,
            "batch_time": str(row.batch_time),
            "source_file": row.source_file,
            "similarity": f"{sim:.4f}",
            "raw_json": row.raw_json,
        }
        docs.append(Document(page_content=row.flat_text, metadata=meta))
    return docs
148
+
149
# --- LangChain Retriever ---
class AzureSQLVectorRetriever:
    """Thin retriever over the Azure SQL vector table.

    NOTE(review): RetrievalQA normally requires a langchain BaseRetriever
    (which implements get_relevant_documents via a pydantic model); this
    plain duck-typed class may be rejected by newer langchain versions —
    confirm against the pinned langchain release.
    """
    def __init__(self, top_k=5):
        # Number of most-similar records to return per query.
        self.top_k = top_k
    def get_relevant_documents(self, query):
        """Return the top_k most similar Documents for *query*."""
        return query_vector_db(query, self.top_k)
155
# NOTE(review): langchain's OpenAI class wraps the *completions* endpoint;
# "gpt-4o" is a chat model, which typically requires ChatOpenAI — verify this
# works with the pinned langchain/openai versions.
llm = OpenAI(model="gpt-4o", openai_api_key=OPENAI_API_KEY, temperature=0)
retriever = AzureSQLVectorRetriever(top_k=5)
# RetrievalQA chain: retrieves similar records, then answers with the LLM;
# source documents are returned so the UI can link back to raw JSON.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
# --- Chat UI & Conversation Loop (preserves your history/modal system) ---
st.header("Chat with all accumulated records")

def show_json_links_and_modal():
    """Render "View JSON" buttons for the most recent retrieval results,
    plus the expander-based modal showing the selected raw record."""
    # Most recent AI_DOCS payload wins; older retrievals are ignored.
    latest_docs = next(
        (payload for role, payload in reversed(st.session_state.chat_history)
         if role == "AI_DOCS"),
        None,
    )
    if latest_docs is not None:
        for pos, doc in enumerate(latest_docs):
            title = f"{doc.metadata['source_file']} (#{doc.metadata['id']})"
            if st.button(f"View JSON: {title}", key=f"modal_{pos}"):
                # Clicking a link opens the modal with the pretty-printed record.
                st.session_state.modal_open = True
                st.session_state.modal_content = json.dumps(
                    json.loads(doc.metadata["raw_json"]), indent=2
                )
                st.session_state.modal_title = title
    if st.session_state.modal_open:
        with st.expander(f"JSON Record: {st.session_state.modal_title}", expanded=True):
            st.code(st.session_state.modal_content, language="json")
            if st.button("Close", key="close_modal"):
                st.session_state.modal_open = False
183
 
184
# --- Chat input ---
user_input = st.text_input("Ask a question about ALL data (old and new):", key="user_input")
if st.button("Send") and user_input:
    with st.spinner("Thinking..."):
        # NOTE(review): calling the chain directly (qa_chain(...)) is the
        # deprecated __call__ style in newer langchain; .invoke() may be
        # required depending on the pinned version.
        result = qa_chain(user_input)
    # Record the turn: question, answer, and the retrieved source documents
    # (AI_DOCS entries drive the "View JSON" links below).
    st.session_state.chat_history.append(("User", user_input))
    st.session_state.chat_history.append(("AI", result['result']))
    st.session_state.chat_history.append(("AI_DOCS", result['source_documents']))

# --- Show conversation ---
# AI_DOCS entries are intentionally not rendered here; they are consumed by
# show_json_links_and_modal().
for speaker, msg in st.session_state.chat_history:
    if speaker == "User":
        st.markdown(f"<div style='color: #4F8BF9;'><b>User:</b> {msg}</div>", unsafe_allow_html=True)
    elif speaker == "AI":
        st.markdown(f"<div style='color: #1C6E4C;'><b>Agent:</b> {msg}</div>", unsafe_allow_html=True)

show_json_links_and_modal()

# Clears only the on-screen conversation; ingested DB rows are untouched.
if st.button("Clear chat"):
    st.session_state.chat_history = []

st.info(f"Batches ingested so far (this session): {st.session_state.ingested_batches}")