Rajan Sharma committed on
Commit
1ca7039
·
verified ·
1 Parent(s): 4f1d205

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -125
app.py CHANGED
@@ -47,20 +47,17 @@ from huggingface_hub import login
47
 
48
  from safety import safety_filter, refusal_reply
49
  from retriever import init_retriever, retrieve_context
50
-
51
- # ---------- Snapshot & retrieval helpers import ----------
52
- # Use the real function if present; otherwise fall back to a harmless no-op.
53
- try:
54
- from decision_math import compute_operational_numbers
55
- except Exception:
56
- def compute_operational_numbers(snapshot: dict) -> dict:
57
- return {}
58
-
59
  from prompt_templates import build_system_preamble
60
  from upload_ingest import extract_text_from_files
61
  from session_rag import SessionRAG
62
  from mdsi_analysis import capacity_projection, cost_estimate, outcomes_summary
63
 
 
 
 
 
 
64
  # ---------- Config ----------
65
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct") # fallback
66
  HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
@@ -238,102 +235,8 @@ def _mdsi_block():
238
  "outcomes_summary": outcomes
239
  }, indent=2)
240
 
241
- # ---------- Dynamic Phase 1 question generator ----------
242
- def _extract_present_domains(artifacts: List[Dict[str, Any]]) -> Dict[str, bool]:
243
- flags = dict(population=False, cost=False, clinical=False, capacity=False)
244
- for a in artifacts or []:
245
- name = (a.get("name") or "").lower()
246
- cols = [c.lower() for c in (a.get("columns") or [])]
247
- if any(k in name for k in ["population", "census", "membership"]) or any(
248
- k in ",".join(cols) for k in ["population", "census", "residence", "settlement", "age"]
249
- ):
250
- flags["population"] = True
251
- if any(k in name for k in ["cost", "finance", "budget"]) or any(
252
- k in ",".join(cols) for k in ["cost", "startup", "ongoing", "per_client", "per-visit"]
253
- ):
254
- flags["cost"] = True
255
- if any(k in name for k in ["a1c", "outcome", "bp", "chol"]) or any(
256
- k in ",".join(cols) for k in ["a1c", "bmi", "bp", "chol", "outcome"]
257
- ):
258
- flags["clinical"] = True
259
- if any(k in name for k in ["ops", "capacity", "throughput", "volume"]) or any(
260
- k in ",".join(cols) for k in ["clients_per_day", "teams", "visits", "throughput"]
261
- ):
262
- flags["capacity"] = True
263
- return flags
264
-
265
- def _domain_from_text(text: str) -> Dict[str, bool]:
266
- t = (text or "").lower()
267
- return {
268
- "population": any(k in t for k in ["population", "census", "settlement", "membership"]),
269
- "cost": any(k in t for k in ["cost", "budget", "startup", "per client", "per-client", "ongoing"]),
270
- "clinical": any(k in t for k in ["a1c", "bmi", "blood pressure", "bp", "cholesterol", "outcome"]),
271
- "capacity": any(k in t for k in ["capacity", "throughput", "clients per day", "teams", "screen", "volume"]),
272
- }
273
-
274
- def _is_mdsi_diabetes(text: str) -> bool:
275
- t = (text or "").lower()
276
- return any(k in t for k in ["mdsi", "mobile diabetes", "diabetes", "metabolic", "a1c", "metis"])
277
-
278
- def build_dynamic_clarifications(scenario_text: str, artifacts: List[Dict[str, Any]]) -> str:
279
- flags_from_files = _extract_present_domains(artifacts)
280
- flags_from_text = _domain_from_text(scenario_text)
281
- missing = {
282
- k: not (flags_from_files.get(k) or flags_from_text.get(k))
283
- for k in ["population", "capacity", "cost", "clinical"]
284
- }
285
-
286
- qs: List[Tuple[str, str]] = []
287
- is_mdsi = _is_mdsi_diabetes(scenario_text)
288
-
289
- if missing["population"]:
290
- qs.append((
291
- "Prioritization",
292
- "Which population/risk indicators should drive prioritization (size, prevalence, access, equity factors)?"
293
- if not is_mdsi else
294
- "Confirm prioritization inputs: settlement membership living on-settlement (latest), obesity/metabolic syndrome prevalence, and any access-to-care constraints to weigh."
295
- ))
296
-
297
- if missing["capacity"]:
298
- qs.append((
299
- "Capacity",
300
- "What per-team throughput and operating schedule should be used for capacity calculations?"
301
- if not is_mdsi else
302
- "What is the realistic per-team screening rate (clients/day) and operating schedule (days/week, weeks/3-month window)?"
303
- ))
304
-
305
- if missing["cost"]:
306
- qs.append((
307
- "Cost",
308
- "Provide fixed setup costs and variable cost per client to model total program spend."
309
- if not is_mdsi else
310
- "Provide startup cost per client and ongoing cost per client/visit (or total program costs) to price scenarios like 1,200 screens."
311
- ))
312
-
313
- if missing["clinical"]:
314
- qs.append((
315
- "Clinical",
316
- "Which clinical indicators and expected effect sizes should be tracked for outcomes?"
317
- if not is_mdsi else
318
- "What longitudinal deltas should we expect (e.g., ΔA1c, ΔBP, ΔBMI, lipids) from repeat screenings, and over what interval?"
319
- ))
320
-
321
- qs.append((
322
- "Recommendations",
323
- "Any operational constraints (scheduling, staffing, partnerships) we should incorporate into deployment modeling?"
324
- if not is_mdsi else
325
- "Are there community constraints (events/seasonality/cultural protocols) that should shape routing and visit cadence?"
326
- ))
327
-
328
- qs = qs[:5]
329
- out = ["**Clarification Questions**"]
330
- current_group = None
331
- for grp, q in qs:
332
- if grp != current_group:
333
- out.append(f"\n**{grp}:**")
334
- current_group = grp
335
- out.append(f"- {q}")
336
- return "\n".join(out)
337
 
338
  # ---------- Core chat logic (auto scenario, dynamic Phase 1) ----------
339
  def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answers=False):
@@ -349,6 +252,7 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
349
  ans = "I am ClarityOps, your strategic decision making AI partner."
350
  return history + [(user_msg, ans)], awaiting_answers
351
 
 
352
  artifacts = []
353
  if uploaded_files_paths:
354
  ing = extract_text_from_files(uploaded_files_paths)
@@ -358,16 +262,24 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
358
  _session_rag.add_docs(chunks)
359
  if artifacts:
360
  _session_rag.register_artifacts(artifacts)
361
- log_event("uploads_added", None, {"chunks": len(chunks), "artifacts": len(artifacts)})
 
 
 
 
 
362
 
 
363
  if re.search(r"\b(columns?|headers?)\b", (safe_in or "").lower()):
364
  cols = _session_rag.get_latest_csv_columns()
365
  if cols:
366
  return history + [(user_msg, "Here are the column names from your most recent CSV upload:\n\n- " + "\n- ".join(cols))], awaiting_answers
367
 
 
368
  scenario_mode = is_scenario_triggered(safe_in, uploaded_files_paths)
369
 
370
  if not scenario_mode:
 
371
  out = cohere_chat(safe_in, history) if USE_HOSTED_COHERE else None
372
  if not out:
373
  model, tokenizer = load_local_model()
@@ -390,8 +302,13 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
390
  })
391
  return history + [(user_msg, safe_out)], awaiting_answers
392
 
 
 
 
 
393
  if not awaiting_answers:
394
- phase1 = build_dynamic_clarifications(scenario_text=safe_in, artifacts=artifacts or _session_rag.artifacts)
 
395
  phase1 = _sanitize_text(phase1)
396
  log_event("assistant_reply", None, {
397
  **hash_summary("prompt", safe_in if not PERSIST_CONTENT else ""),
@@ -401,11 +318,23 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
401
  })
402
  return history + [(user_msg, phase1)], True
403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
  session_snips = "\n---\n".join(_session_rag.retrieve(
405
  "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
406
  k=6
407
  ))
408
-
409
  snapshot = _load_snapshot()
410
  policy_context = retrieve_context(
411
  "mobile diabetes screening Indigenous community outreach cultural safety data governance outcomes"
@@ -413,34 +342,29 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
413
  computed = compute_operational_numbers(snapshot)
414
 
415
  user_lower = (safe_in or "").lower()
416
- mdsi_extra = _mdsi_block() if ("diabetes" in user_lower or "mdsi" in user_lower or "mobile screening" in user_lower) else ""
417
-
418
- arts = _session_rag.artifacts or []
419
- if arts:
420
- arts_summ = []
421
- for a in arts:
422
- nm = a.get("name") or "<unnamed>"
423
- cols = ", ".join(a.get("columns") or [])[:600]
424
- rows = a.get("n_rows_sampled") or 0
425
- arts_summ.append(f"- {nm}: columns[{cols}] sample_rows={rows}")
426
- artifact_block = "Uploaded Data Files (summarized):\n" + "\n".join(arts_summ)
427
- else:
428
- artifact_block = "Uploaded Data Files (summarized):\n- <none>"
429
 
430
  scenario_block = safe_in if len((safe_in or "")) > 0 else ""
431
  system_preamble = build_system_preamble(
432
  snapshot=snapshot,
433
  policy_context=policy_context,
434
  computed_numbers=computed,
435
- scenario_text=scenario_block + f"\n\n{artifact_block}" + (f"\n\nExecutive Pre-Computed Blocks:\n{mdsi_extra}" if mdsi_extra else ""),
436
  session_snips=session_snips
437
  )
438
 
439
  directive = (
440
  "\n\n[INSTRUCTION TO MODEL]\n"
441
- "Produce **Phase 2** only now: start with 'Structured Analysis' and follow the exact section order "
442
  "(Prioritization, Capacity, Cost, Clinical Benefits, ClarityOps Top 3 Recommendations). "
443
- "Use uploaded files + the user's latest answers as authoritative. Show calculations, units, and a brief Provenance.\n"
 
444
  )
445
 
446
  augmented_user = SYSTEM_MASTER + "\n\n" + system_preamble + "\n\nUser scenario & answers:\n" + safe_in + directive
@@ -481,7 +405,7 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
481
  # ---------- Theme & CSS ----------
482
  theme = gr.themes.Soft(primary_hue="teal", neutral_hue="slate", radius_size=gr.themes.sizes.radius_lg)
483
  custom_css = """
484
- :root { --brand-bg: #0f172a; --brand-accent: #0d9488; --brand-text: #0f172a; --brand-text-light: #ffffff; } /* CHANGED bg only */
485
  html, body, .gradio-container { height: 100vh; }
486
  .gradio-container { background: var(--brand-bg); display: flex; flex-direction: column; }
487
 
@@ -605,6 +529,8 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
605
  concurrency_limit=2, queue=True)
606
 
607
  def _on_clear():
 
 
608
  return (
609
  [], "", [], False,
610
  gr.update(visible=True),
 
47
 
48
  from safety import safety_filter, refusal_reply
49
  from retriever import init_retriever, retrieve_context
50
+ from decision_math import compute_operational_numbers # fixed import name
 
 
 
 
 
 
 
 
51
  from prompt_templates import build_system_preamble
52
  from upload_ingest import extract_text_from_files
53
  from session_rag import SessionRAG
54
  from mdsi_analysis import capacity_projection, cost_estimate, outcomes_summary
55
 
56
+ # NEW: dynamic data plumbing
57
+ from data_registry import DataRegistry
58
+ from schema_mapper import map_concepts, build_phase1_questions
59
+ from auto_metrics import build_data_findings_markdown
60
+
61
  # ---------- Config ----------
62
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct") # fallback
63
  HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
 
235
  "outcomes_summary": outcomes
236
  }, indent=2)
237
 
238
+ # NEW: session-scoped data registry
239
+ _data_registry = DataRegistry()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
  # ---------- Core chat logic (auto scenario, dynamic Phase 1) ----------
242
  def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answers=False):
 
252
  ans = "I am ClarityOps, your strategic decision making AI partner."
253
  return history + [(user_msg, ans)], awaiting_answers
254
 
255
+ # 1) Ingest uploads into RAG AND DataRegistry (files alone can trigger Scenario Mode)
256
  artifacts = []
257
  if uploaded_files_paths:
258
  ing = extract_text_from_files(uploaded_files_paths)
 
262
  _session_rag.add_docs(chunks)
263
  if artifacts:
264
  _session_rag.register_artifacts(artifacts)
265
+ # register parsable tables into DataRegistry
266
+ for p in uploaded_files_paths:
267
+ _data_registry.add_path(p)
268
+ log_event("uploads_added", None, {
269
+ "chunks": len(chunks), "artifacts": len(artifacts), "tables": len(_data_registry.names())
270
+ })
271
 
272
+ # quick helper
273
  if re.search(r"\b(columns?|headers?)\b", (safe_in or "").lower()):
274
  cols = _session_rag.get_latest_csv_columns()
275
  if cols:
276
  return history + [(user_msg, "Here are the column names from your most recent CSV upload:\n\n- " + "\n- ".join(cols))], awaiting_answers
277
 
278
+ # 2) Decide mode
279
  scenario_mode = is_scenario_triggered(safe_in, uploaded_files_paths)
280
 
281
  if not scenario_mode:
282
+ # ---------- Normal conversational chat ----------
283
  out = cohere_chat(safe_in, history) if USE_HOSTED_COHERE else None
284
  if not out:
285
  model, tokenizer = load_local_model()
 
302
  })
303
  return history + [(user_msg, safe_out)], awaiting_answers
304
 
305
+ # ---------- Scenario Mode ----------
306
+ # 3) Build dynamic concept mapping from scenario + data
307
+ mapping = map_concepts(safe_in, _data_registry)
308
+
309
  if not awaiting_answers:
310
+ # PHASE 1: ask only for missing/ambiguous
311
+ phase1 = build_phase1_questions(scenario_text=safe_in, registry=_data_registry, mapping=mapping)
312
  phase1 = _sanitize_text(phase1)
313
  log_event("assistant_reply", None, {
314
  **hash_summary("prompt", safe_in if not PERSIST_CONTENT else ""),
 
318
  })
319
  return history + [(user_msg, phase1)], True
320
 
321
+ # PHASE 2: compute data findings in Python, then let LLM write the narrative
322
+ data_findings_md, missing_keys = build_data_findings_markdown(_data_registry, mapping)
323
+
324
+ # If critical missing items remain, surface INSUFFICIENT_DATA context to the model + ask for the rest
325
+ insuff_note = ""
326
+ if missing_keys:
327
+ insuff_note = (
328
+ "\n\nUncomputable (still missing columns/defs): "
329
+ + ", ".join(sorted(set(missing_keys)))
330
+ + ". If any of these are essential to the requested outputs, write INSUFFICIENT_DATA where appropriate."
331
+ )
332
+
333
+ # Preamble context (snapshot + policy)
334
  session_snips = "\n---\n".join(_session_rag.retrieve(
335
  "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
336
  k=6
337
  ))
 
338
  snapshot = _load_snapshot()
339
  policy_context = retrieve_context(
340
  "mobile diabetes screening Indigenous community outreach cultural safety data governance outcomes"
 
342
  computed = compute_operational_numbers(snapshot)
343
 
344
  user_lower = (safe_in or "").lower()
345
+ mdsi_extra = ""
346
+ if any(k in user_lower for k in ["diabetes", "mdsi", "mobile screening"]):
347
+ mdsi_extra = _mdsi_block()
348
+
349
+ # Build artifact + table summary for the prompt
350
+ registry_summary = _data_registry.summarize_for_prompt()
351
+ artifact_block = "Uploaded Data Files (tables):\n" + registry_summary
 
 
 
 
 
 
352
 
353
  scenario_block = safe_in if len((safe_in or "")) > 0 else ""
354
  system_preamble = build_system_preamble(
355
  snapshot=snapshot,
356
  policy_context=policy_context,
357
  computed_numbers=computed,
358
+ scenario_text=scenario_block + f"\n\n{artifact_block}\n\n{data_findings_md}" + (f"\n\nExecutive Pre-Computed Blocks:\n{mdsi_extra}" if mdsi_extra else "") + insuff_note,
359
  session_snips=session_snips
360
  )
361
 
362
  directive = (
363
  "\n\n[INSTRUCTION TO MODEL]\n"
364
+ "Produce **Phase 2** now: begin with 'Structured Analysis' and follow the exact section order "
365
  "(Prioritization, Capacity, Cost, Clinical Benefits, ClarityOps Top 3 Recommendations). "
366
+ "Use the **Python-computed tables** in the context as ground truth; when something is truly missing, write INSUFFICIENT_DATA. "
367
+ "Show calculations, units, and add a brief Provenance.\n"
368
  )
369
 
370
  augmented_user = SYSTEM_MASTER + "\n\n" + system_preamble + "\n\nUser scenario & answers:\n" + safe_in + directive
 
405
  # ---------- Theme & CSS ----------
406
  theme = gr.themes.Soft(primary_hue="teal", neutral_hue="slate", radius_size=gr.themes.sizes.radius_lg)
407
  custom_css = """
408
+ :root { --brand-bg: #0f172a; --brand-accent: #0d9488; --brand-text: #0f172a; --brand-text-light: #ffffff; } /* bg same as chat for integrated look */
409
  html, body, .gradio-container { height: 100vh; }
410
  .gradio-container { background: var(--brand-bg); display: flex; flex-direction: column; }
411
 
 
529
  concurrency_limit=2, queue=True)
530
 
531
  def _on_clear():
532
+ # Also clear the in-memory data registry for a fresh scenario
533
+ _data_registry.clear()
534
  return (
535
  [], "", [], False,
536
  gr.update(visible=True),