Rajan Sharma committed on
Commit
5651d3e
·
verified ·
1 Parent(s): 87088be

Update narrative_safetynet.py

Browse files
Files changed (1) hide show
  1. narrative_safetynet.py +93 -222
narrative_safetynet.py CHANGED
@@ -1,262 +1,133 @@
1
  # narrative_safetynet.py
2
  from __future__ import annotations
3
- from typing import Dict, Any, List, Optional
 
4
  import math
5
  import numpy as np
6
  import pandas as pd
7
- import re
8
 
9
- _DEF_MIN_SAMPLE = 5 # threshold for "interpret with caution" (fully generic)
 
 
 
 
 
 
 
 
 
10
 
11
- def _is_numeric(s: pd.Series) -> bool:
12
- return pd.api.types.is_numeric_dtype(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def _fmt_num(x: Any, decimals: int = 1) -> str:
15
  try:
16
  if x is None or (isinstance(x, float) and math.isnan(x)):
17
  return "n/a"
18
- if isinstance(x, (int, np.integer)):
19
- return f"{x:,}"
20
  return f"{float(x):,.{decimals}f}"
21
  except Exception:
22
  return str(x)
23
 
24
- def _pick_numeric(df: pd.DataFrame, hints: List[str]) -> Optional[str]:
25
- # choose a numeric column; prefer hinted names
26
- cols = list(df.columns)
 
 
 
 
27
  for h in hints:
28
- for c in cols:
29
- if h.lower() in c.lower() and _is_numeric(df[c]):
30
- return c
31
- for c in cols:
32
- if _is_numeric(df[c]):
33
- return c
34
- return None
35
 
36
- def _find_group_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  cols = list(df.columns)
 
38
  for cand in candidates:
39
  for c in cols:
40
- if cand.lower() in c.lower():
 
41
  return c
42
- # fallback: first reasonable categorical column
43
  obj_cols = [c for c in cols if df[c].dtype == "object"]
44
  for c in obj_cols:
45
  nuniq = df[c].nunique(dropna=True)
46
- if 1 < nuniq < max(50, len(df) // 10):
47
  return c
48
  return None
49
 
50
- def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
51
- dff = df.copy()
52
- for c in dff.columns:
53
- if dff[c].dtype == "object":
54
- dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
55
- return dff
56
-
57
- def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
58
- return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
59
 
60
- def _deviation_label(x: float, mu: float, tol: float = 0.01) -> str:
61
- if np.isnan(x) or np.isnan(mu) or mu == 0:
62
  return "unknown"
63
  rel = (x - mu) / mu
64
- if rel > 0.05:
65
  return "higher than average"
66
- if rel < -0.05:
67
  return "lower than average"
68
- if abs(rel) <= max(tol, 0.05):
69
- return "about average"
70
  return "about average"
71
 
72
- def _pluralize(label: str, n: int) -> str:
73
- return f"{label}{'' if n==1 else 's'}"
74
-
75
- def build_narrative(
76
- scenario_text: str,
77
- datasets: Dict[str, Any],
78
- structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
79
- metric_hints: Optional[List[str]] = None,
80
- group_hints: Optional[List[str]] = None,
81
- min_sample: int = _DEF_MIN_SAMPLE
82
- ) -> str:
83
- """
84
- Scenario-agnostic narrative fallback:
85
- - Picks numeric metric & groupings dynamically
86
- - Computes overall baseline + deviations
87
- - Warns on small samples
88
- - Optional geographic notes if city/lat/lon exist
89
- """
90
- metric_hints = metric_hints or ["surgery_median", "consult_median", "wait", "median", "p90", "90th"]
91
- group_hints = group_hints or ["facility", "specialty", "zone", "hospital", "city", "region"]
92
-
93
- # 1) choose first non-empty table-like dataset
94
- df = None
95
- df_key = None
96
- for k, v in datasets.items():
97
- if isinstance(v, pd.DataFrame) and not v.empty:
98
- df = _nanlike_to_nan(v)
99
- df_key = k
100
- break
101
- if df is None:
102
- return "No tabular data available. Unable to generate a narrative."
103
-
104
- # 2) metrics
105
- primary_metric = _pick_numeric(df, metric_hints) # e.g., Surgery_Median
106
- if not primary_metric:
107
- return "No numeric metric found to summarize; please ensure at least one numeric wait-time column is present."
108
-
109
- other_numeric = [c for c in df.columns if _is_numeric(df[c]) and c != primary_metric]
110
- comparator_metric = next(
111
- (c for c in other_numeric if any(h in c.lower() for h in ["consult", "wait", "median", "p90", "90th"])),
112
- None
113
- )
114
-
115
- # 3) groups
116
- group1 = _find_group_col(df, group_hints) # e.g., Facility
117
- group2 = None
118
- if group1:
119
- alt_hints = [h for h in group_hints if h.lower() not in group1.lower()]
120
- group2 = _find_group_col(df.drop(columns=[group1], errors="ignore"), alt_hints)
121
-
122
- # 4) baseline + grouped
123
- baseline = pd.to_numeric(df[primary_metric], errors="coerce").mean(skipna=True)
124
-
125
- def _group_stats(col: str) -> Optional[pd.DataFrame]:
126
- if not col:
127
- return None
128
- tmp = df.copy()
129
- tmp[primary_metric] = pd.to_numeric(tmp[primary_metric], errors="coerce")
130
- comp_col = comparator_metric or primary_metric
131
- if comp_col in tmp.columns:
132
- tmp[comp_col] = pd.to_numeric(tmp[comp_col], errors="coerce")
133
- agg = (
134
- tmp.groupby(col, dropna=False)
135
- .agg(
136
- metric=(primary_metric, "mean"),
137
- count=(primary_metric, "count"),
138
- comp=(comp_col, "mean") if comp_col in tmp.columns else (primary_metric, "mean"),
139
- )
140
- .reset_index()
141
- )
142
- return agg
143
-
144
- g1 = _group_stats(group1)
145
- g2 = _group_stats(group2)
146
-
147
- # 5) Top groups (by primary metric) from group1
148
- top_lines: List[str] = []
149
- if isinstance(g1, pd.DataFrame) and not g1.empty:
150
- g1 = g1.sort_values(by="metric", ascending=False)
151
- k = min(5, len(g1))
152
- for i, row in enumerate(g1.head(k).itertuples(index=False), 1):
153
- label = getattr(row, group1)
154
- metric = getattr(row, "metric")
155
- comp = getattr(row, "comp")
156
- cnt = getattr(row, "count")
157
- devlab = _deviation_label(metric, baseline)
158
- caution = _small_sample_note(int(cnt), min_sample)
159
- msg = f"{i}. **{label}** — {primary_metric}: {_fmt_num(metric)}"
160
- if comparator_metric:
161
- msg += f"; {comparator_metric}: {_fmt_num(comp)}"
162
- msg += f"; {_pluralize('record', int(cnt))}: {cnt}"
163
- msg += f" → {devlab}"
164
- if caution:
165
- msg += f" ({caution})"
166
- top_lines.append(msg)
167
-
168
- # 6) Group2 overview
169
- region_lines: List[str] = []
170
- if isinstance(g2, pd.DataFrame) and not g2.empty:
171
- g2 = g2.sort_values(by="metric", ascending=False)
172
- for row in g2.itertuples(index=False):
173
- label = getattr(row, group2)
174
- metric = getattr(row, "metric")
175
- comp = getattr(row, "comp")
176
- cnt = getattr(row, "count")
177
- devlab = _deviation_label(metric, baseline)
178
- caution = _small_sample_note(int(cnt), min_sample)
179
- line = f"- **{label}**: {_fmt_num(metric)} (vs. overall {_fmt_num(baseline)} → {devlab}); n={cnt}"
180
- if comparator_metric:
181
- line += f"; {comparator_metric}: {_fmt_num(comp)}"
182
- if caution:
183
- line += f" — {caution}"
184
- region_lines.append(line)
185
-
186
- # 7) Geographic notes (optional)
187
- geo_notes: List[str] = []
188
- city_col = next((c for c in df.columns if re.search(r"\bcity\b", c, re.I)), None)
189
- lat_col = next((c for c in df.columns if re.search(r"\b(lat|latitude)\b", c, re.I)), None)
190
- lon_col = next((c for c in df.columns if re.search(r"\b(lon|longitude)\b", c, re.I)), None)
191
- if group1 and city_col and (lat_col and lon_col):
192
- if isinstance(g1, pd.DataFrame) and not g1.empty and group1 in df.columns:
193
- top_labels = g1[group1].astype(str).head(10).tolist()
194
- sub = df[df[group1].astype(str).isin(top_labels)].copy()
195
- if not sub.empty:
196
- sub[primary_metric] = pd.to_numeric(sub[primary_metric], errors="coerce")
197
- by_city = (
198
- sub.groupby(city_col, dropna=False)[primary_metric]
199
- .mean()
200
- .reset_index()
201
- .sort_values(by=primary_metric, ascending=False)
202
- )
203
- for r in by_city.head(3).to_dict(orient="records"):
204
- cname = r.get(city_col)
205
- val = r.get(primary_metric)
206
- geo_notes.append(f"- **{cname}** shows higher average {primary_metric} among top groups ({_fmt_num(val)}).")
207
-
208
- # 8) Methodology (auto)
209
- methodology: List[str] = []
210
- na_counts = df.isna().sum().sum()
211
- if na_counts > 0:
212
- methodology.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
213
- methodology.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
214
- if comparator_metric:
215
- methodology.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
216
- if group1:
217
- methodology.append(f"Primary grouping inferred: **{group1}**.")
218
- if group2:
219
- methodology.append(f"Secondary grouping inferred: **{group2}**.")
220
- if min_sample != _DEF_MIN_SAMPLE:
221
- methodology.append(f"Small-sample threshold set to {min_sample} records.")
222
-
223
- # 9) Compose markdown
224
- lines: List[str] = []
225
- lines.append("## Methodology (Auto-generated)")
226
- for m in methodology:
227
- lines.append(f"- {m}")
228
- lines.append("")
229
-
230
- if top_lines:
231
- lines.append("## Highest average values by group")
232
- lines.extend(top_lines)
233
- lines.append("")
234
-
235
- if region_lines:
236
- lines.append(f"## {group2 or 'Region/Category'} comparison vs overall")
237
- lines.extend(region_lines)
238
- lines.append("")
239
 
240
- if geo_notes:
241
- lines.append("## Geographic notes")
242
- lines.extend(geo_notes)
243
- lines.append("")
244
 
245
- recs: List[str] = []
246
- if top_lines:
247
- recs.append("Prioritize resources to the highest-average groups (above overall baseline), especially those with sufficient volume.")
248
- if comparator_metric:
249
- recs.append(f"Cross-check {comparator_metric} trends to identify upstream bottlenecks (e.g., long consult waits pushing surgery waits).")
250
- if isinstance(g2, pd.DataFrame) and not g2.empty:
251
- high = g2[g2["metric"] > baseline]
252
- if not high.empty:
253
- recs.append(f"Address disparities where average **{primary_metric}** exceeds the overall baseline.")
254
- recs.append("For very small groups, validate data quality and consider pooling across similar categories to stabilize estimates.")
255
- recs.append("Validate coding differences (similar specialties or labels spelled differently) to ensure apples-to-apples comparison.")
256
 
257
- lines.append("## Recommendations (Auto-generated)")
258
- for r in recs:
259
- lines.append(f"- {r}")
260
 
261
- return "\n".join(lines).strip()
262
 
 
1
  # narrative_safetynet.py
2
  from __future__ import annotations
3
+ from typing import Dict, Any, List, Optional, Tuple
4
+ import re
5
  import math
6
  import numpy as np
7
  import pandas as pd
 
8
 
9
+ # -------------------- helpers: dtype / formatting --------------------
10
+
11
+ _DEF_MIN_SAMPLE = 5 # generic caution threshold for group sizes
12
+
13
+ _HINT_METRICS_DEFAULT = [
14
+ "surgery_median", "consult_median",
15
+ "surgery_90th", "consult_90th",
16
+ "surgery", "consult",
17
+ "wait", "median", "p90", "90th"
18
+ ]
19
 
20
+ _HINT_GROUPS_DEFAULT = [
21
+ "facility", "specialty", "zone",
22
+ "hospital", "city", "region"
23
+ ]
24
+
25
+ _BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]
26
+
27
+ def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
28
+ dff = df.copy()
29
+ for c in dff.columns:
30
+ if dff[c].dtype == "object":
31
+ dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
32
+ return dff
33
+
34
+ def _is_numeric_series(s: pd.Series) -> bool:
35
+ try:
36
+ return pd.api.types.is_numeric_dtype(s)
37
+ except Exception:
38
+ return False
39
+
40
+ def _to_numeric(s: pd.Series) -> pd.Series:
41
+ return pd.to_numeric(s, errors="coerce")
42
 
43
  def _fmt_num(x: Any, decimals: int = 1) -> str:
44
  try:
45
  if x is None or (isinstance(x, float) and math.isnan(x)):
46
  return "n/a"
47
+ if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
48
+ return f"{int(round(float(x))):,}"
49
  return f"{float(x):,.{decimals}f}"
50
  except Exception:
51
  return str(x)
52
 
53
+ # -------------------- metric & dataset selection (dynamic) --------------------
54
+
55
+ def _score_metric_name(col: str, hints: List[str]) -> int:
56
+ name = (col or "").lower()
57
+ if any(bad in name for bad in _BAD_METRIC_NAMES):
58
+ return -10**6 # disqualify obvious counters/ids
59
+ score = 0
60
  for h in hints:
61
+ if h in name:
62
+ score += 3
63
+ return score
 
 
 
 
64
 
65
+ def _choose_df_and_metric(
66
+ datasets: Dict[str, Any],
67
+ metric_hints: List[str]
68
+ ) -> Optional[Tuple[str, pd.DataFrame, str]]:
69
+ """
70
+ Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
71
+ +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.
72
+ """
73
+ best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
74
+ for key, v in datasets.items():
75
+ if not isinstance(v, pd.DataFrame) or v.empty:
76
+ continue
77
+ df = _nanlike_to_nan(v)
78
+ for col in df.columns:
79
+ col_num = _to_numeric(df[col])
80
+ if not _is_numeric_series(col_num):
81
+ continue
82
+ s = _score_metric_name(col, metric_hints)
83
+ if col_num.nunique(dropna=True) > 1:
84
+ s += 1
85
+ if best is None or s > best[0]:
86
+ best = (s, key, df, col)
87
+ if best is None:
88
+ return None
89
+ _, key, df, metric = best
90
+ return key, df, metric
91
+
92
+ # -------------------- grouping detection (dynamic) --------------------
93
+
94
+ def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
95
+ avoid = [a.lower() for a in (avoid or [])]
96
  cols = list(df.columns)
97
+ # prefer name matches
98
  for cand in candidates:
99
  for c in cols:
100
+ cname = c.lower()
101
+ if cand.lower() in cname and all(a not in cname for a in avoid):
102
  return c
103
+ # fallback: a categorical with reasonable cardinality
104
  obj_cols = [c for c in cols if df[c].dtype == "object"]
105
  for c in obj_cols:
106
  nuniq = df[c].nunique(dropna=True)
107
+ if 1 < nuniq < max(50, len(df)//10):
108
  return c
109
  return None
110
 
111
+ # -------------------- labels & cautions --------------------
 
 
 
 
 
 
 
 
112
 
113
+ def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
114
+ if pd.isna(x) or pd.isna(mu) or mu == 0:
115
  return "unknown"
116
  rel = (x - mu) / mu
117
+ if rel > band:
118
  return "higher than average"
119
+ if rel < -band:
120
  return "lower than average"
 
 
121
  return "about average"
122
 
123
+ def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
124
+ return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ def _pluralize(word: str, n: int) -> str:
127
+ return f"{word}{'' if n == 1 else 's'}"
 
 
128
 
129
+ # -------------------- geo join (Top-5 only) --------------------
 
 
 
 
 
 
 
 
 
 
130
 
131
+ def _canon(s: str) -> s_
 
 
132
 
 
133