Spaces:
Sleeping
Sleeping
| # narrative_safetynet.py | |
| from __future__ import annotations | |
| from typing import Dict, Any, List, Optional, Tuple | |
| import re | |
| import math | |
| import numpy as np | |
| import pandas as pd | |
| # -------------------- helpers: dtype / formatting -------------------- | |
| _DEF_MIN_SAMPLE = 5 # generic caution threshold for group sizes | |
| _HINT_METRICS_DEFAULT = [ | |
| "surgery_median", "consult_median", | |
| "surgery_90th", "consult_90th", | |
| "surgery", "consult", | |
| "wait", "median", "p90", "90th" | |
| ] | |
| _HINT_GROUPS_DEFAULT = [ | |
| "facility", "specialty", "zone", | |
| "hospital", "city", "region" | |
| ] | |
| _BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"] | |
| def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame: | |
| dff = df.copy() | |
| for c in dff.columns: | |
| if dff[c].dtype == "object": | |
| dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-ββ]$": np.nan}, regex=True) | |
| return dff | |
| def _is_numeric_series(s: pd.Series) -> bool: | |
| try: | |
| return pd.api.types.is_numeric_dtype(s) | |
| except Exception: | |
| return False | |
| def _to_numeric(s: pd.Series) -> pd.Series: | |
| return pd.to_numeric(s, errors="coerce") | |
| def _fmt_num(x: Any, decimals: int = 1) -> str: | |
| try: | |
| if x is None or (isinstance(x, float) and math.isnan(x)): | |
| return "n/a" | |
| if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()): | |
| return f"{int(round(float(x))):,}" | |
| return f"{float(x):,.{decimals}f}" | |
| except Exception: | |
| return str(x) | |
| # -------------------- metric & dataset selection (dynamic) -------------------- | |
def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score how likely a column name denotes a metric of interest.

    Args:
        col: Column label. Non-string labels (e.g. integer positions from a
            header-less frame) are converted with ``str()``; ``None`` is
            treated as an empty name.
        hints: Lower-case fragments; each one found in the name adds 3.

    Returns:
        ``-10**6`` when the name contains any fragment from
        ``_BAD_METRIC_NAMES`` (counter / identifier columns), otherwise
        3 points per matching hint.
    """
    # str() guards against non-string labels, which have no .lower().
    name = "" if col is None else str(col).lower()

    # NOTE(review): this is a plain substring test, so short fragments such
    # as "id" or "row" also disqualify names like "width" or "growth".
    if any(bad in name for bad in _BAD_METRIC_NAMES):
        return -10**6  # disqualify obvious counters/ids

    return 3 * sum(1 for hint in hints if hint in name)
def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """Pick the most promising (dataset, metric column) pair.

    Every non-empty DataFrame in *datasets* is cleaned with
    ``_nanlike_to_nan`` and each of its columns is coerced to numbers.
    Columns that yield no numeric value at all are ignored. The remaining
    candidates are scored:

    * +3 per hint found in the column name (see ``_score_metric_name``);
    * +1 when the column holds more than one distinct numeric value;
    * id-like names receive a very large penalty.

    Ties are resolved in favour of the first candidate encountered.

    Args:
        datasets: Mapping of dataset key to arbitrary objects; values that
            are not DataFrames (or are empty) are skipped.
        metric_hints: Name fragments passed to ``_score_metric_name``.

    Returns:
        ``(key, cleaned_df, metric_column)`` for the best candidate, or
        ``None`` when no dataset offers a usable numeric column.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None

    for key, candidate in datasets.items():
        if not isinstance(candidate, pd.DataFrame) or candidate.empty:
            continue

        df = _nanlike_to_nan(candidate)
        for col in df.columns:
            col_num = _to_numeric(df[col])
            if not _is_numeric_series(col_num):
                continue
            # Coercion turns unparseable text into NaN, so a pure-text column
            # comes back as an all-NaN numeric series. Without this guard it
            # could be selected as the metric on the strength of its name.
            if not col_num.notna().any():
                continue

            score = _score_metric_name(col, metric_hints)
            if col_num.nunique(dropna=True) > 1:
                score += 1

            if best is None or score > best[0]:
                best = (score, key, df, col)

    if best is None:
        return None

    _, key, df, metric = best
    return key, df, metric
| # -------------------- grouping detection (dynamic) -------------------- | |
| def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]: | |
| avoid = [a.lower() for a in (avoid or [])] | |
| cols = list(df.columns) | |
| # prefer name matches | |
| for cand in candidates: | |
| for c in cols: | |
| cname = c.lower() | |
| if cand.lower() in cname and all(a not in cname for a in avoid): | |
| return c | |
| # fallback: a categorical with reasonable cardinality | |
| obj_cols = [c for c in cols if df[c].dtype == "object"] | |
| for c in obj_cols: | |
| nuniq = df[c].nunique(dropna=True) | |
| if 1 < nuniq < max(50, len(df)//10): | |
| return c | |
| return None | |
| # -------------------- labels & cautions -------------------- | |
| def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str: | |
| if pd.isna(x) or pd.isna(mu) or mu == 0: | |
| return "unknown" | |
| rel = (x - mu) / mu | |
| if rel > band: | |
| return "higher than average" | |
| if rel < -band: | |
| return "lower than average" | |
| return "about average" | |
def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when a group has fewer than *min_n* records.

    Args:
        n: Number of records in the group.
        min_n: Minimum group size considered reliable.

    Returns:
        The caution text when ``n < min_n``, otherwise ``None``.
    """
    if n < min_n:
        # Use the shared helper so a single record reads "1 record".
        unit = _pluralize("record", n)
        return f"Interpret averages cautiously (only {n} {unit})."
    return None
| def _pluralize(word: str, n: int) -> str: | |
| return f"{word}{'' if n == 1 else 's'}" | |
| # -------------------- geo join (Top-5 only) -------------------- | |
| def _canon(s: str) -> s_ | |