Rajan Sharma committed on
Commit
4f1d205
·
verified ·
1 Parent(s): f0584a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -245
app.py CHANGED
@@ -47,12 +47,15 @@ from huggingface_hub import login
47
 
48
  from safety import safety_filter, refusal_reply
49
  from retriever import init_retriever, retrieve_context
50
- from decision_math import compute_operitional_numbers as compute_operational_numbers # in case of rename
 
 
51
  try:
52
- # prefer the original name if present
53
- from decision_math import compute_operational_numbers as compute_operational_numbers
54
  except Exception:
55
- pass
 
 
56
  from prompt_templates import build_system_preamble
57
  from upload_ingest import extract_text_from_files
58
  from session_rag import SessionRAG
@@ -235,237 +238,6 @@ def _mdsi_block():
235
  "outcomes_summary": outcomes
236
  }, indent=2)
237
 
238
- # ---------- Scenario-agnostic dynamic analytics (multi-file) ----------
239
- import pandas as _pd
240
- from collections import Counter
241
- import itertools as _it
242
- import numpy as _np
243
-
244
- _NUM_SAMPLE_ROWS = 50000 # cap per file for speed
245
-
246
- def _read_table(path: str) -> _pd.DataFrame:
247
- try:
248
- if path.lower().endswith((".xlsx", ".xls")):
249
- return _pd.read_excel(path)
250
- return _pd.read_csv(path, low_memory=False, nrows=_NUM_SAMPLE_ROWS)
251
- except Exception:
252
- return _pd.DataFrame()
253
-
254
- def _profile_schema(df: _pd.DataFrame) -> Dict[str, Any]:
255
- if df.empty:
256
- return {"numeric": [], "categorical": [], "datetime": [], "textlike": []}
257
- numeric, categorical, datetime, textlike = [], [], [], []
258
- for c in df.columns:
259
- s = df[c]
260
- if _pd.api.types.is_numeric_dtype(s):
261
- numeric.append(c)
262
- elif _pd.api.types.is_datetime64_any_dtype(s):
263
- datetime.append(c)
264
- else:
265
- uniq = s.astype(str).nunique(dropna=True)
266
- if uniq <= max(50, int(0.03 * max(1, len(s)))):
267
- categorical.append(c)
268
- else:
269
- textlike.append(c)
270
- return {"numeric": numeric, "categorical": categorical, "datetime": datetime, "textlike": textlike}
271
-
272
- def _safe_num(s: _pd.Series) -> _pd.Series:
273
- if not _pd.api.types.is_numeric_dtype(s):
274
- return _pd.to_numeric(s, errors="coerce")
275
- return s
276
-
277
- def _top_numeric_by_variance(df: _pd.DataFrame, numeric_cols: List[str], k=5) -> List[str]:
278
- scores = []
279
- for c in numeric_cols:
280
- x = _safe_num(df[c])
281
- try:
282
- scores.append((c, _np.nanvar(x.values)))
283
- except Exception:
284
- scores.append((c, _np.nan))
285
- scores.sort(key=lambda t: (t[1] if _np.isfinite(t[1]) else -1), reverse=True)
286
- return [c for c, _ in scores[:k]]
287
-
288
- def _top_categories(df: _pd.DataFrame, cat_cols: List[str], k=3) -> Dict[str, List[Tuple[str,int]]]:
289
- out = {}
290
- for c in cat_cols[:6]:
291
- vc = Counter(df[c].astype(str).fillna("<NA>")).most_common(k)
292
- out[c] = vc
293
- return out
294
-
295
- def _infer_candidate_keys(dfs_named: List[Tuple[str, _pd.DataFrame]]) -> List[str]:
296
- all_cols = []
297
- for name, df in dfs_named:
298
- all_cols.extend(list(map(str, df.columns)))
299
- counts = Counter([c.strip() for c in all_cols])
300
- bad = set(["value","values","count","total","sum","mean","median","date","timestamp","index"])
301
- return [c for c, n in counts.items() if n >= 2 and c.lower() not in bad]
302
-
303
- def _try_joins(dfs_named: List[Tuple[str, _pd.DataFrame]], keys: List[str], max_pairs=3) -> List[str]:
304
- previews = []
305
- pairs = list(_it.combinations(range(len(dfs_named)), 2))
306
- shown = 0
307
- for i, j in pairs:
308
- if shown >= max_pairs:
309
- break
310
- name_i, dfi = dfs_named[i]
311
- name_j, dfj = dfs_named[j]
312
- for k in keys:
313
- if k in dfi.columns and k in dfj.columns:
314
- try:
315
- merged = dfi[[k]].dropna().merge(dfj[[k]].dropna(), on=k, how="inner")
316
- previews.append(f"- Join {name_i} ↔ {name_j} on `{k}` → matches: {len(merged):,}")
317
- shown += 1
318
- if shown >= max_pairs:
319
- break
320
- except Exception:
321
- continue
322
- return previews
323
-
324
- def _scenario_tokens(text: str) -> List[str]:
325
- t = (text or "").lower()
326
- t = re.sub(r"[^a-z0-9_ -]+", " ", t)
327
- toks = [w for w in t.split() if len(w) >= 3]
328
- out, seen = [], set()
329
- for w in toks:
330
- if w not in seen:
331
- seen.add(w); out.append(w)
332
- return out
333
-
334
- def _extract_intents(text: str) -> Dict[str, Any]:
335
- toks = _scenario_tokens(text)
336
- intents = {
337
- "rank": any(x in toks for x in ["rank","top","longest","highest","lowest","shortest","worst","best"]),
338
- "agg_words": [w for w in toks if w in set(["mean","average","avg","median","p50","p90","sum","total"])],
339
- "n_top": 5
340
- }
341
- return intents
342
-
343
- def _pick_dims_from_tokens(df: _pd.DataFrame, cat_cols: List[str], toks: List[str]) -> List[str]:
344
- scored = []
345
- for c in cat_cols:
346
- score = sum(1 for t in toks if t in c.lower())
347
- scored.append((score, c))
348
- scored.sort(key=lambda t: (t[0], -len(t[1])), reverse=True)
349
- picked = [c for s, c in scored if s > 0][:3]
350
- if not picked:
351
- picked = cat_cols[:3]
352
- return picked
353
-
354
- def _pick_metrics_from_tokens(df: _pd.DataFrame, num_cols: List[str], toks: List[str]) -> List[str]:
355
- scored = []
356
- for c in num_cols:
357
- score = sum(1 for t in toks if t in c.lower())
358
- scored.append((score, c))
359
- scored.sort(key=lambda t: (t[0], -len(t[1])), reverse=True)
360
- picked = [c for s, c in scored if s > 0][:3]
361
- if not picked:
362
- picked = _top_numeric_by_variance(df, num_cols, k=3)
363
- return picked
364
-
365
- def _mk_table(md_title: str, df: _pd.DataFrame, limit=10) -> str:
366
- if df.empty: return ""
367
- return f"{md_title}\n" + df.head(limit).to_markdown(index=False)
368
-
369
- def compute_dynamic_analytics_block(arts: List[Dict[str, Any]], scenario_text: str) -> str:
370
- dfs_named: List[Tuple[str, _pd.DataFrame]] = []
371
- for a in arts or []:
372
- p = a.get("path"); n = a.get("name") or "table"
373
- if not p: continue
374
- if not str(p).lower().endswith((".csv",".xlsx",".xls")): continue
375
- d = _read_table(p)
376
- if d.empty: continue
377
- d = d.copy()
378
- d.columns = [str(c).strip().replace("\n"," ").replace("\r"," ") for c in d.columns]
379
- dfs_named.append((n, d))
380
-
381
- if not dfs_named:
382
- return ""
383
-
384
- overview_rows = []
385
- for n, d in dfs_named:
386
- overview_rows.append({"File": n, "Rows": len(d), "Columns": d.shape[1]})
387
- overview_md = _pd.DataFrame(overview_rows).to_markdown(index=False)
388
-
389
- per_table_blocks = []
390
- toks = _scenario_tokens(scenario_text)
391
- intents = _extract_intents(scenario_text)
392
-
393
- for n, d in dfs_named:
394
- prof = _profile_schema(d)
395
- num_cols = prof["numeric"]
396
- cat_cols = prof["categorical"]
397
-
398
- top_num = _top_numeric_by_variance(d, num_cols, k=5) if num_cols else []
399
- num_sum = _pd.DataFrame()
400
- if top_num:
401
- stat_rows = []
402
- for c in top_num:
403
- x = _safe_num(d[c])
404
- try_mean = float(_np.nanmean(x)) if x.size else _np.nan
405
- try_median = float(_np.nanmedian(x)) if x.size else _np.nan
406
- try_p90 = float(_np.nanpercentile(x.dropna(), 90)) if x.dropna().size else _np.nan
407
- stat_rows.append({
408
- "Metric": c,
409
- "count": int(x.count()),
410
- "mean": try_mean,
411
- "median": try_median,
412
- "p90": try_p90
413
- })
414
- num_sum = _pd.DataFrame(stat_rows)
415
-
416
- cat_info = _top_categories(d, cat_cols, k=5) if cat_cols else {}
417
- cat_md = []
418
- for c, vc in cat_info.items():
419
- parts = ", ".join([f"{val} ({cnt})" for val, cnt in vc])
420
- cat_md.append(f"- {c}: {parts}")
421
-
422
- rank_tables = []
423
- if intents.get("rank") and num_cols and cat_cols:
424
- dims = _pick_dims_from_tokens(d, cat_cols, toks)
425
- mets = _pick_metrics_from_tokens(d, num_cols, toks)
426
- for gcol in dims[:2]:
427
- for mcol in mets[:2]:
428
- try:
429
- g = (
430
- d.groupby(gcol, as_index=False)[mcol]
431
- .mean(numeric_only=True)
432
- .rename(columns={mcol: f"avg({mcol})"})
433
- .sort_values(f"avg({mcol})", ascending=False)
434
- .head(intents["n_top"])
435
- )
436
- rank_tables.append(_mk_table(f"Top {intents['n_top']} by avg({mcol}) — grouped by {gcol}:", g))
437
- except Exception:
438
- continue
439
-
440
- block_parts = [f"### {n}"]
441
- if not num_sum.empty:
442
- block_parts.append(_mk_table("Numeric summary (top-variance metrics):", num_sum))
443
- if cat_md:
444
- block_parts.append("Top categories:\n" + "\n".join(cat_md))
445
- for rt in rank_tables:
446
- if rt: block_parts.append(rt)
447
-
448
- per_table_blocks.append("\n\n".join([p for p in block_parts if p]))
449
-
450
- keys = _infer_candidate_keys(dfs_named)
451
- join_md = ""
452
- if keys:
453
- joins = _try_joins(dfs_named, keys, max_pairs=3)
454
- if joins:
455
- join_md = "Join previews:\n" + "\n".join(joins)
456
-
457
- parts = [
458
- "Computed Analytics Block (auto-generated, scenario-agnostic):",
459
- "",
460
- "Dataset overview:",
461
- overview_md,
462
- "",
463
- "\n\n".join(per_table_blocks)
464
- ]
465
- if join_md:
466
- parts.extend(["", join_md])
467
- return "\n".join(parts)
468
-
469
  # ---------- Dynamic Phase 1 question generator ----------
470
  def _extract_present_domains(artifacts: List[Dict[str, Any]]) -> Dict[str, bool]:
471
  flags = dict(population=False, cost=False, clinical=False, capacity=False)
@@ -629,7 +401,6 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
629
  })
630
  return history + [(user_msg, phase1)], True
631
 
632
- # ---------- Phase 2 ----------
633
  session_snips = "\n---\n".join(_session_rag.retrieve(
634
  "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
635
  k=6
@@ -656,20 +427,12 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
656
  else:
657
  artifact_block = "Uploaded Data Files (summarized):\n- <none>"
658
 
659
- # NEW: scenario-agnostic, multi-file analytics block
660
- analytics_block = compute_dynamic_analytics_block(arts, safe_in)
661
-
662
  scenario_block = safe_in if len((safe_in or "")) > 0 else ""
663
  system_preamble = build_system_preamble(
664
  snapshot=snapshot,
665
  policy_context=policy_context,
666
  computed_numbers=computed,
667
- scenario_text=(
668
- scenario_block
669
- + f"\n\n{artifact_block}"
670
- + (f"\n\n{analytics_block}" if analytics_block else "")
671
- + (f"\n\nExecutive Pre-Computed Blocks:\n{mdsi_extra}" if mdsi_extra else "")
672
- ),
673
  session_snips=session_snips
674
  )
675
 
@@ -857,3 +620,4 @@ if __name__ == "__main__":
857
 
858
 
859
 
 
 
47
 
48
  from safety import safety_filter, refusal_reply
49
  from retriever import init_retriever, retrieve_context
50
+
51
+ # ---------- Snapshot & retrieval helpers import ----------
52
+ # Use the real function if present; otherwise fall back to a harmless no-op.
53
  try:
54
+ from decision_math import compute_operational_numbers
 
55
  except Exception:
56
+ def compute_operational_numbers(snapshot: dict) -> dict:
57
+ return {}
58
+
59
  from prompt_templates import build_system_preamble
60
  from upload_ingest import extract_text_from_files
61
  from session_rag import SessionRAG
 
238
  "outcomes_summary": outcomes
239
  }, indent=2)
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  # ---------- Dynamic Phase 1 question generator ----------
242
  def _extract_present_domains(artifacts: List[Dict[str, Any]]) -> Dict[str, bool]:
243
  flags = dict(population=False, cost=False, clinical=False, capacity=False)
 
401
  })
402
  return history + [(user_msg, phase1)], True
403
 
 
404
  session_snips = "\n---\n".join(_session_rag.retrieve(
405
  "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
406
  k=6
 
427
  else:
428
  artifact_block = "Uploaded Data Files (summarized):\n- <none>"
429
 
 
 
 
430
  scenario_block = safe_in if len((safe_in or "")) > 0 else ""
431
  system_preamble = build_system_preamble(
432
  snapshot=snapshot,
433
  policy_context=policy_context,
434
  computed_numbers=computed,
435
+ scenario_text=scenario_block + f"\n\n{artifact_block}" + (f"\n\nExecutive Pre-Computed Blocks:\n{mdsi_extra}" if mdsi_extra else ""),
 
 
 
 
 
436
  session_snips=session_snips
437
  )
438
 
 
620
 
621
 
622
 
623
+