Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Community

Rajan Sharma committed on Sep 28

Commit

87088be

verified ·

1 Parent(s): 2fccbc6

Update narrative_safetynet.py

Browse files

Files changed (1) hide show

narrative_safetynet.py +63 -77

narrative_safetynet.py CHANGED Viewed

@@ -1,12 +1,11 @@
 # narrative_safetynet.py
 from __future__ import annotations
-from typing import Dict, Any, List, Optional, Tuple
 import math
 import numpy as np
 import pandas as pd
 import re
-# ---------- Generic helpers ----------
 _DEF_MIN_SAMPLE = 5  # threshold for "interpret with caution" (fully generic)
 def _is_numeric(s: pd.Series) -> bool:
@@ -23,7 +22,7 @@ def _fmt_num(x: Any, decimals: int = 1) -> str:
         return str(x)
 def _pick_numeric(df: pd.DataFrame, hints: List[str]) -> Optional[str]:
-    # choose a numeric column; use hints like "Surgery_Median", "Consult_Median" if present
     cols = list(df.columns)
     for h in hints:
         for c in cols:
@@ -35,22 +34,20 @@ def _pick_numeric(df: pd.DataFrame, hints: List[str]) -> Optional[str]:
     return None
 def _find_group_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
-    # choose a categorical/grouping column by fuzzy name
     cols = list(df.columns)
     for cand in candidates:
         for c in cols:
             if cand.lower() in c.lower():
                 return c
-    # fallback: first object/string column with reasonable cardinality
     obj_cols = [c for c in cols if df[c].dtype == "object"]
     for c in obj_cols:
         nuniq = df[c].nunique(dropna=True)
-        if 1 < nuniq < max(50, len(df) // 10):  # avoid IDs (too high cardinality) and constants
             return c
     return None
 def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
-    # treat dashes and blank strings as NaN; do not hard-code schema
     dff = df.copy()
     for c in dff.columns:
         if dff[c].dtype == "object":
@@ -61,15 +58,12 @@ def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
     return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
 def _deviation_label(x: float, mu: float, tol: float = 0.01) -> str:
-    # tol is a fraction of mu for ≈ equal bucket (1% default)
-    if np.isnan(x) or np.isnan(mu):
-        return "unknown"
-    if mu == 0:
         return "unknown"
     rel = (x - mu) / mu
-    if rel > 0.05:  # > +5% above average
         return "higher than average"
-    if rel < -0.05: # < -5% below average
         return "lower than average"
     if abs(rel) <= max(tol, 0.05):
         return "about average"
@@ -78,31 +72,25 @@ def _deviation_label(x: float, mu: float, tol: float = 0.01) -> str:
 def _pluralize(label: str, n: int) -> str:
     return f"{label}{'' if n==1 else 's'}"
-# ---------- Core narrative generator ----------
 def build_narrative(
     scenario_text: str,
-    # A dict of dataframes your engine just produced / loaded (e.g., the same dict passed to ScenarioEngine)
     datasets: Dict[str, Any],
-    # Optional structured outputs your engine already rendered (tables etc.) if you want to cross-reference
     structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
-    # Hints for metric selection (keeps it scenario-agnostic)
-    metric_hints: Optional[List[str]] = None,    # e.g. ["Surgery_Median", "Consult_Median", "Wait"]
-    group_hints: Optional[List[str]] = None,     # e.g. ["Facility","Specialty","Zone"]
     min_sample: int = _DEF_MIN_SAMPLE
 ) -> str:
     """
-    Returns a markdown narrative that:
-      - Summarizes methodology (cleaning, numeric detection)
-      - Highlights top groups by the chosen metric
-      - Computes an overall baseline and compares groups vs baseline
-      - Flags small-sample groups
-      - Adds geographic notes if city/lat/lon are present (fully optional)
-    This function avoids any scenario-specific strings and infers columns dynamically.
     """
-    metric_hints = metric_hints or ["surgery_median", "consult_median", "wait", "median", "p50"]
     group_hints  = group_hints  or ["facility", "specialty", "zone", "hospital", "city", "region"]
-    # 1) Pick a primary dataset (first table-like) and sanitize
     df = None
     df_key = None
     for k, v in datasets.items():
@@ -113,50 +101,50 @@ def build_narrative(
     if df is None:
         return "No tabular data available. Unable to generate a narrative."
-    # 2) Pick primary metric (numeric) and up to two comparators (e.g., consult vs surgery)
     primary_metric = _pick_numeric(df, metric_hints)   # e.g., Surgery_Median
     if not primary_metric:
         return "No numeric metric found to summarize; please ensure at least one numeric wait-time column is present."
     other_numeric = [c for c in df.columns if _is_numeric(df[c]) and c != primary_metric]
-    comparator_metric = next((c for c in other_numeric if any(h in c.lower() for h in ["consult", "wait", "median", "p90", "90th"])), None)
-    # 3) Choose groupings dynamically
     group1 = _find_group_col(df, group_hints)  # e.g., Facility
     group2 = None
     if group1:
-        # try to find a second group that isn't identical (e.g., Zone if Facility selected)
         alt_hints = [h for h in group_hints if h.lower() not in group1.lower()]
         group2 = _find_group_col(df.drop(columns=[group1], errors="ignore"), alt_hints)
-    # 4) Baseline (overall) and grouped stats
-    baseline = df[primary_metric].astype(float).mean(skipna=True)
-    # grouped (group1)
-    g1 = None
-    if group1:
-        g1 = (
-            df.groupby(group1, dropna=False)
-              .agg(
-                  metric=(primary_metric, "mean"),
-                  count=(primary_metric, "count"),
-                  _comp=(comparator_metric, "mean") if comparator_metric else (primary_metric, "mean"),
-              )
-              .reset_index()
-        )
-    # grouped (group2)
-    g2 = None
-    if group2:
-        g2 = (
-            df.groupby(group2, dropna=False)
-              .agg(
-                  metric=(primary_metric, "mean"),
-                  count=(primary_metric, "count"),
-                  _comp=(comparator_metric, "mean") if comparator_metric else (primary_metric, "mean"),
-              )
-              .reset_index()
         )
-    # 5) Identify top/bottom (by deviation) for group1
     top_lines: List[str] = []
     if isinstance(g1, pd.DataFrame) and not g1.empty:
         g1 = g1.sort_values(by="metric", ascending=False)
@@ -164,7 +152,7 @@ def build_narrative(
         for i, row in enumerate(g1.head(k).itertuples(index=False), 1):
             label = getattr(row, group1)
             metric = getattr(row, "metric")
-            comp   = getattr(row, "_comp")
             cnt    = getattr(row, "count")
             devlab = _deviation_label(metric, baseline)
             caution = _small_sample_note(int(cnt), min_sample)
@@ -177,15 +165,14 @@ def build_narrative(
                 msg += f" ({caution})"
             top_lines.append(msg)
-    # 6) Zone/region style overview (group2)
     region_lines: List[str] = []
     if isinstance(g2, pd.DataFrame) and not g2.empty:
-        # order by metric descending
         g2 = g2.sort_values(by="metric", ascending=False)
         for row in g2.itertuples(index=False):
             label = getattr(row, group2)
             metric = getattr(row, "metric")
-            comp   = getattr(row, "_comp")
             cnt    = getattr(row, "count")
             devlab = _deviation_label(metric, baseline)
             caution = _small_sample_note(int(cnt), min_sample)
@@ -196,34 +183,33 @@ def build_narrative(
                 line += f" — {caution}"
             region_lines.append(line)
-    # 7) Geographic notes (if present)
-    # We never hard-code field names; we look for city/lat/lon patterns
     geo_notes: List[str] = []
     city_col = next((c for c in df.columns if re.search(r"\bcity\b", c, re.I)), None)
     lat_col  = next((c for c in df.columns if re.search(r"\b(lat|latitude)\b", c, re.I)), None)
     lon_col  = next((c for c in df.columns if re.search(r"\b(lon|longitude)\b", c, re.I)), None)
     if group1 and city_col and (lat_col and lon_col):
-        # summarize whether top groups cluster in specific cities
         if isinstance(g1, pd.DataFrame) and not g1.empty and group1 in df.columns:
-            # join back to get city data for topK
-            top_labels = [re.sub(r"\s+", " ", re.sub(r"^\s+|\s+$", "", re.sub(r"\n", " ", l))) for l in g1[group1].astype(str).head(10).tolist()]
-            sub = df[df[group1].astype(str).isin(top_labels)]
             if not sub.empty:
-                by_city = sub.groupby(city_col, dropna=False)[primary_metric].mean().reset_index().sort_values(by=primary_metric, ascending=False)
-                # Only a brief, dynamic note (no hard-coded cities)
-                top_city_rows = by_city.head(3).to_dict(orient="records")
-                for r in top_city_rows:
                     cname = r.get(city_col)
                     val = r.get(primary_metric)
                     geo_notes.append(f"- **{cname}** shows higher average {primary_metric} among top groups ({_fmt_num(val)}).")
-    # 8) Methodology (derived from actual data conditions)
     methodology: List[str] = []
-    # missing values
     na_counts = df.isna().sum().sum()
     if na_counts > 0:
         methodology.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
-    # numeric coercion note
     methodology.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
     if comparator_metric:
         methodology.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
@@ -256,7 +242,6 @@ def build_narrative(
         lines.extend(geo_notes)
         lines.append("")
-    # Generic recommendations template (data-driven, not hard-coded)
     recs: List[str] = []
     if top_lines:
         recs.append("Prioritize resources to the highest-average groups (above overall baseline), especially those with sufficient volume.")
@@ -265,7 +250,7 @@ def build_narrative(
     if isinstance(g2, pd.DataFrame) and not g2.empty:
         high = g2[g2["metric"] > baseline]
         if not high.empty:
-            recs.append(f"Address regional disparities where average **{primary_metric}** exceeds the overall baseline.")
     recs.append("For very small groups, validate data quality and consider pooling across similar categories to stabilize estimates.")
     recs.append("Validate coding differences (similar specialties or labels spelled differently) to ensure apples-to-apples comparison.")
@@ -274,3 +259,4 @@ def build_narrative(
         lines.append(f"- {r}")
     return "\n".join(lines).strip()

 # narrative_safetynet.py
 from __future__ import annotations
+from typing import Dict, Any, List, Optional
 import math
 import numpy as np
 import pandas as pd
 import re
 _DEF_MIN_SAMPLE = 5  # threshold for "interpret with caution" (fully generic)
 def _is_numeric(s: pd.Series) -> bool:
         return str(x)
 def _pick_numeric(df: pd.DataFrame, hints: List[str]) -> Optional[str]:
+    # choose a numeric column; prefer hinted names
     cols = list(df.columns)
     for h in hints:
         for c in cols:
     return None
 def _find_group_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
     cols = list(df.columns)
     for cand in candidates:
         for c in cols:
             if cand.lower() in c.lower():
                 return c
+    # fallback: first reasonable categorical column
     obj_cols = [c for c in cols if df[c].dtype == "object"]
     for c in obj_cols:
         nuniq = df[c].nunique(dropna=True)
+        if 1 < nuniq < max(50, len(df) // 10):
             return c
     return None
 def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
     dff = df.copy()
     for c in dff.columns:
         if dff[c].dtype == "object":
     return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
 def _deviation_label(x: float, mu: float, tol: float = 0.01) -> str:
+    if np.isnan(x) or np.isnan(mu) or mu == 0:
         return "unknown"
     rel = (x - mu) / mu
+    if rel > 0.05:
         return "higher than average"
+    if rel < -0.05:
         return "lower than average"
     if abs(rel) <= max(tol, 0.05):
         return "about average"
 def _pluralize(label: str, n: int) -> str:
     return f"{label}{'' if n==1 else 's'}"
 def build_narrative(
     scenario_text: str,
     datasets: Dict[str, Any],
     structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
+    metric_hints: Optional[List[str]] = None,
+    group_hints: Optional[List[str]] = None,
     min_sample: int = _DEF_MIN_SAMPLE
 ) -> str:
     """
+    Scenario-agnostic narrative fallback:
+      - Picks numeric metric & groupings dynamically
+      - Computes overall baseline + deviations
+      - Warns on small samples
+      - Optional geographic notes if city/lat/lon exist
     """
+    metric_hints = metric_hints or ["surgery_median", "consult_median", "wait", "median", "p90", "90th"]
     group_hints  = group_hints  or ["facility", "specialty", "zone", "hospital", "city", "region"]
+    # 1) choose first non-empty table-like dataset
     df = None
     df_key = None
     for k, v in datasets.items():
     if df is None:
         return "No tabular data available. Unable to generate a narrative."
+    # 2) metrics
     primary_metric = _pick_numeric(df, metric_hints)   # e.g., Surgery_Median
     if not primary_metric:
         return "No numeric metric found to summarize; please ensure at least one numeric wait-time column is present."
     other_numeric = [c for c in df.columns if _is_numeric(df[c]) and c != primary_metric]
+    comparator_metric = next(
+        (c for c in other_numeric if any(h in c.lower() for h in ["consult", "wait", "median", "p90", "90th"])),
+        None
+    )
+    # 3) groups
     group1 = _find_group_col(df, group_hints)  # e.g., Facility
     group2 = None
     if group1:
         alt_hints = [h for h in group_hints if h.lower() not in group1.lower()]
         group2 = _find_group_col(df.drop(columns=[group1], errors="ignore"), alt_hints)
+    # 4) baseline + grouped
+    baseline = pd.to_numeric(df[primary_metric], errors="coerce").mean(skipna=True)
+    def _group_stats(col: str) -> Optional[pd.DataFrame]:
+        if not col:
+            return None
+        tmp = df.copy()
+        tmp[primary_metric] = pd.to_numeric(tmp[primary_metric], errors="coerce")
+        comp_col = comparator_metric or primary_metric
+        if comp_col in tmp.columns:
+            tmp[comp_col] = pd.to_numeric(tmp[comp_col], errors="coerce")
+        agg = (
+            tmp.groupby(col, dropna=False)
+               .agg(
+                   metric=(primary_metric, "mean"),
+                   count=(primary_metric, "count"),
+                   comp=(comp_col, "mean") if comp_col in tmp.columns else (primary_metric, "mean"),
+               )
+               .reset_index()
         )
+        return agg
+    g1 = _group_stats(group1)
+    g2 = _group_stats(group2)
+    # 5) Top groups (by primary metric) from group1
     top_lines: List[str] = []
     if isinstance(g1, pd.DataFrame) and not g1.empty:
         g1 = g1.sort_values(by="metric", ascending=False)
         for i, row in enumerate(g1.head(k).itertuples(index=False), 1):
             label = getattr(row, group1)
             metric = getattr(row, "metric")
+            comp   = getattr(row, "comp")
             cnt    = getattr(row, "count")
             devlab = _deviation_label(metric, baseline)
             caution = _small_sample_note(int(cnt), min_sample)
                 msg += f" ({caution})"
             top_lines.append(msg)
+    # 6) Group2 overview
     region_lines: List[str] = []
     if isinstance(g2, pd.DataFrame) and not g2.empty:
         g2 = g2.sort_values(by="metric", ascending=False)
         for row in g2.itertuples(index=False):
             label = getattr(row, group2)
             metric = getattr(row, "metric")
+            comp   = getattr(row, "comp")
             cnt    = getattr(row, "count")
             devlab = _deviation_label(metric, baseline)
             caution = _small_sample_note(int(cnt), min_sample)
                 line += f" — {caution}"
             region_lines.append(line)
+    # 7) Geographic notes (optional)
     geo_notes: List[str] = []
     city_col = next((c for c in df.columns if re.search(r"\bcity\b", c, re.I)), None)
     lat_col  = next((c for c in df.columns if re.search(r"\b(lat|latitude)\b", c, re.I)), None)
     lon_col  = next((c for c in df.columns if re.search(r"\b(lon|longitude)\b", c, re.I)), None)
     if group1 and city_col and (lat_col and lon_col):
         if isinstance(g1, pd.DataFrame) and not g1.empty and group1 in df.columns:
+            top_labels = g1[group1].astype(str).head(10).tolist()
+            sub = df[df[group1].astype(str).isin(top_labels)].copy()
             if not sub.empty:
+                sub[primary_metric] = pd.to_numeric(sub[primary_metric], errors="coerce")
+                by_city = (
+                    sub.groupby(city_col, dropna=False)[primary_metric]
+                       .mean()
+                       .reset_index()
+                       .sort_values(by=primary_metric, ascending=False)
+                )
+                for r in by_city.head(3).to_dict(orient="records"):
                     cname = r.get(city_col)
                     val = r.get(primary_metric)
                     geo_notes.append(f"- **{cname}** shows higher average {primary_metric} among top groups ({_fmt_num(val)}).")
+    # 8) Methodology (auto)
     methodology: List[str] = []
     na_counts = df.isna().sum().sum()
     if na_counts > 0:
         methodology.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
     methodology.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
     if comparator_metric:
         methodology.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
         lines.extend(geo_notes)
         lines.append("")
     recs: List[str] = []
     if top_lines:
         recs.append("Prioritize resources to the highest-average groups (above overall baseline), especially those with sufficient volume.")
     if isinstance(g2, pd.DataFrame) and not g2.empty:
         high = g2[g2["metric"] > baseline]
         if not high.empty:
+            recs.append(f"Address disparities where average **{primary_metric}** exceeds the overall baseline.")
     recs.append("For very small groups, validate data quality and consider pooling across similar categories to stabilize estimates.")
     recs.append("Validate coding differences (similar specialties or labels spelled differently) to ensure apples-to-apples comparison.")
         lines.append(f"- {r}")
     return "\n".join(lines).strip()