Medica_DecisionSupportAI / narrative_safetynet.py
Rajan Sharma
Update narrative_safetynet.py
5651d3e verified
raw
history blame
4.3 kB
# narrative_safetynet.py
from __future__ import annotations
from typing import Dict, Any, List, Optional, Tuple
import re
import math
import numpy as np
import pandas as pd
# -------------------- helpers: dtype / formatting --------------------
_DEF_MIN_SAMPLE = 5 # generic caution threshold for group sizes
_HINT_METRICS_DEFAULT = [
"surgery_median", "consult_median",
"surgery_90th", "consult_90th",
"surgery", "consult",
"wait", "median", "p90", "90th"
]
_HINT_GROUPS_DEFAULT = [
"facility", "specialty", "zone",
"hospital", "city", "region"
]
_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]
def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
dff = df.copy()
for c in dff.columns:
if dff[c].dtype == "object":
dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
return dff
def _is_numeric_series(s: pd.Series) -> bool:
try:
return pd.api.types.is_numeric_dtype(s)
except Exception:
return False
def _to_numeric(s: pd.Series) -> pd.Series:
return pd.to_numeric(s, errors="coerce")
def _fmt_num(x: Any, decimals: int = 1) -> str:
try:
if x is None or (isinstance(x, float) and math.isnan(x)):
return "n/a"
if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
return f"{int(round(float(x))):,}"
return f"{float(x):,.{decimals}f}"
except Exception:
return str(x)
# -------------------- metric & dataset selection (dynamic) --------------------
def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score how likely a column name is to be a metric of interest.

    Args:
        col: Column label. Non-string labels are converted with ``str``.
        hints: Name fragments to look for; matching is case-insensitive.

    Returns:
        ``-10**6`` when the name contains an id/counter word listed in
        ``_BAD_METRIC_NAMES``; otherwise 3 points per hint found in the name.
    """
    raw = "" if col is None else str(col)
    name = raw.lower()

    # Compare whole words, not substrings, so that names such as
    # "provider_wait", "width" or "growth" are not disqualified merely for
    # containing "id" or "row". Words are split on non-alphanumerics,
    # letter/digit boundaries and camelCase humps.
    spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", raw)
    words = set(re.findall(r"[a-z]+|\d+", spaced.lower()))
    if any(str(bad).lower() in words for bad in _BAD_METRIC_NAMES):
        return -10**6  # disqualify obvious counters/ids

    # Empty hints are ignored: "" is a substring of every name.
    return sum(3 for h in hints if h and str(h).lower() in name)
def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """
    Sweep all dataframes and their columns; pick the (df, metric) pair with
    the best score.

    Scoring: +3 per hint match in the column name, +1 if the column holds
    more than one distinct numeric value. Id-like names are disqualified by
    ``_score_metric_name``. Ties keep the first candidate encountered.

    Columns that contain no numeric value at all after coercion are skipped,
    so a text column cannot win on its name alone.

    Args:
        datasets: Mapping of dataset key to arbitrary objects; only non-empty
            ``pd.DataFrame`` values are considered.
        metric_hints: Name fragments passed to ``_score_metric_name``.

    Returns:
        ``(key, cleaned_df, metric_column)`` for the best candidate, where
        ``cleaned_df`` is the placeholder-normalised copy of the dataframe,
        or ``None`` when no usable column exists.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
    for key, candidate in datasets.items():
        if not isinstance(candidate, pd.DataFrame) or candidate.empty:
            continue
        df = _nanlike_to_nan(candidate)
        for col in df.columns:
            column = df[col]
            if not isinstance(column, pd.Series):
                # Duplicate labels select a DataFrame, which cannot be coerced.
                continue
            col_num = _to_numeric(column)
            if not _is_numeric_series(col_num):
                continue
            if not col_num.notna().any():
                # errors="coerce" turns pure-text columns into all-NaN
                # numeric series; those are not usable metrics.
                continue
            score = _score_metric_name(col, metric_hints)
            if col_num.nunique(dropna=True) > 1:
                score += 1
            if best is None or score > best[0]:
                best = (score, key, df, col)
    if best is None:
        return None
    _, key, df, metric = best
    return key, df, metric
# -------------------- grouping detection (dynamic) --------------------
def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
avoid = [a.lower() for a in (avoid or [])]
cols = list(df.columns)
# prefer name matches
for cand in candidates:
for c in cols:
cname = c.lower()
if cand.lower() in cname and all(a not in cname for a in avoid):
return c
# fallback: a categorical with reasonable cardinality
obj_cols = [c for c in cols if df[c].dtype == "object"]
for c in obj_cols:
nuniq = df[c].nunique(dropna=True)
if 1 < nuniq < max(50, len(df)//10):
return c
return None
# -------------------- labels & cautions --------------------
def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
if pd.isna(x) or pd.isna(mu) or mu == 0:
return "unknown"
rel = (x - mu) / mu
if rel > band:
return "higher than average"
if rel < -band:
return "lower than average"
return "about average"
def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when *n* is below *min_n*, else ``None``."""
    if n < min_n:
        return f"Interpret averages cautiously (only {n} records)."
    return None
def _pluralize(word: str, n: int) -> str:
return f"{word}{'' if n == 1 else 's'}"
# -------------------- geo join (Top-5 only) --------------------
def _canon(s: str) -> s_