File size: 17,188 Bytes
42a6bd6
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
 
 
 
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
42a6bd6
 
5651d3e
 
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
5651d3e
 
42a6bd6
 
5651d3e
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
 
42a6bd6
5651d3e
 
42a6bd6
5651d3e
42a6bd6
1b29d16
 
42a6bd6
1b29d16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87088be
1b29d16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# narrative_safetynet.py
from __future__ import annotations
from typing import Dict, Any, List, Optional, Tuple
import re
import math
import numpy as np
import pandas as pd

# -------------------- helpers: dtype / formatting --------------------

_DEF_MIN_SAMPLE = 5  # generic caution threshold for group sizes

_HINT_METRICS_DEFAULT = [
    "surgery_median", "consult_median",
    "surgery_90th", "consult_90th",
    "surgery", "consult",
    "wait", "median", "p90", "90th"
]

_HINT_GROUPS_DEFAULT = [
    "facility", "specialty", "zone",
    "hospital", "city", "region"
]

_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]

def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    dff = df.copy()
    for c in dff.columns:
        if dff[c].dtype == "object":
            dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
    return dff

def _is_numeric_series(s: pd.Series) -> bool:
    try:
        return pd.api.types.is_numeric_dtype(s)
    except Exception:
        return False

def _to_numeric(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def _fmt_num(x: Any, decimals: int = 1) -> str:
    try:
        if x is None or (isinstance(x, float) and math.isnan(x)):
            return "n/a"
        if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
            return f"{int(round(float(x))):,}"
        return f"{float(x):,.{decimals}f}"
    except Exception:
        return str(x)

# -------------------- metric & dataset selection (dynamic) --------------------

def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score how strongly a column label looks like a metric of interest.

    Args:
        col: Column label. Non-string labels (e.g. integer positions) are
            converted with ``str`` instead of raising.
        hints: Substrings that indicate a metric; each one found in the
            lower-cased label adds 3 points.

    Returns:
        ``-10**6`` when the label looks like an identifier/counter (see
        ``_BAD_METRIC_NAMES``), otherwise ``3 * number_of_matching_hints``.

    Short markers (three characters or fewer, e.g. "id", "row") must match a
    whole word of the label, where words are split on punctuation, digits and
    camelCase boundaries. This keeps names such as "provider_wait" or
    "growth" eligible. Longer markers ("index", "unnamed") still match as
    substrings. NOTE(review): fully concatenated lower-case identifiers such
    as "patientid" are no longer disqualified by the "id" marker.
    """
    label = "" if col is None else str(col)
    name = label.lower()
    words = {
        w.lower()
        for w in re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+", label)
    }
    for bad in _BAD_METRIC_NAMES:
        marker = bad.lower()
        hit = (marker in words) if len(marker) <= 3 else (marker in name)
        if hit:
            return -10**6  # disqualify obvious counters/ids
    return 3 * sum(1 for h in hints if h in name)

def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """Pick the (dataset, metric column) pair that best matches the hints.

    Every non-empty DataFrame in *datasets* is cleaned with
    ``_nanlike_to_nan`` and each of its columns is coerced to numbers.
    Columns in which nothing parses as a number are skipped; previously a
    pure-text column survived the coercion as an all-NaN numeric series and
    could win on name hints alone.

    Scoring: the name score from ``_score_metric_name`` plus 1 when the
    column holds more than one distinct numeric value. Ties keep the first
    candidate encountered.

    Args:
        datasets: Mapping of dataset key to arbitrary objects; only
            DataFrames are considered. ``None`` or an empty mapping yields
            ``None``.
        metric_hints: Substrings passed to ``_score_metric_name``.

    Returns:
        ``(key, cleaned_copy_of_frame, column_label)`` or ``None`` when no
        usable numeric column exists.
    """
    if not datasets:
        return None

    best_score: Optional[int] = None
    best_choice: Optional[Tuple[str, pd.DataFrame, str]] = None
    for key, value in datasets.items():
        if not isinstance(value, pd.DataFrame) or value.empty:
            continue
        df = _nanlike_to_nan(value)
        for col in df.columns:
            col_num = _to_numeric(df[col])
            if not _is_numeric_series(col_num):
                continue
            distinct = col_num.nunique(dropna=True)
            if distinct == 0:
                # errors="coerce" turned every cell into NaN: not a numeric column.
                continue
            score = _score_metric_name(col, metric_hints)
            if distinct > 1:
                score += 1
            if best_score is None or score > best_score:
                best_score = score
                best_choice = (key, df, col)
    return best_choice

# -------------------- grouping detection (dynamic) --------------------

def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
    avoid = [a.lower() for a in (avoid or [])]
    cols = list(df.columns)
    # prefer name matches
    for cand in candidates:
        for c in cols:
            cname = c.lower()
            if cand.lower() in cname and all(a not in cname for a in avoid):
                return c
    # fallback: a categorical with reasonable cardinality
    obj_cols = [c for c in cols if df[c].dtype == "object"]
    for c in obj_cols:
        nuniq = df[c].nunique(dropna=True)
        if 1 < nuniq < max(50, len(df)//10):
            return c
    return None

# -------------------- labels & cautions --------------------

def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
    if pd.isna(x) or pd.isna(mu) or mu == 0:
        return "unknown"
    rel = (x - mu) / mu
    if rel > band:
        return "higher than average"
    if rel < -band:
        return "lower than average"
    return "about average"

def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when a group has fewer than *min_n* records.

    Args:
        n: Number of records behind the group's average.
        min_n: Threshold below which the caution applies.

    Returns:
        The caution text, or ``None`` when ``n >= min_n``.
    """
    if n >= min_n:
        return None
    noun = "record" if n == 1 else "records"  # avoid "only 1 records"
    return f"Interpret averages cautiously (only {n} {noun})."

def _pluralize(word: str, n: int) -> str:
    return f"{word}{'' if n == 1 else 's'}"

# -------------------- geo join (Top-5 only) --------------------

def _canon(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

def _map_top_facilities_to_odhf(
    top_facilities: pd.DataFrame,
    odhf: pd.DataFrame,
    fac_col: str = "Facility",
    odhf_name_col: str = "facility_name"
) -> pd.DataFrame:
    """Attach city / coordinates from an ODHF-like table to facility names.

    Matching is attempted in two steps for every distinct facility name:
      1. exact match on the canonical form (``_canon``) of both names; when
         several ODHF rows share a canonical name the last one wins;
      2. otherwise the first ODHF row whose name contains the facility name
         as a literal, case-insensitive substring.

    Args:
        top_facilities: Frame holding the facility names in *fac_col*.
        odhf: Reference table with a name column and, ideally, ``city``,
            ``latitude`` and ``longitude`` columns (looked up exactly first,
            then case-insensitively).
        fac_col: Name column in *top_facilities*.
        odhf_name_col: Name column in *odhf*.

    Returns:
        A frame with columns ``Facility``, ``city``, ``latitude``,
        ``longitude`` (one row per matched facility; missing reference
        columns yield ``None``). An empty frame is returned when either
        input is missing/empty or a name column does not exist.
    """
    if odhf is None or odhf.empty or top_facilities is None or top_facilities.empty:
        return pd.DataFrame()
    if fac_col not in top_facilities.columns or odhf_name_col not in odhf.columns:
        return pd.DataFrame()

    names = odhf[odhf_name_col]
    facilities = top_facilities[fac_col]
    if not isinstance(names, pd.Series) or not isinstance(facilities, pd.Series):
        # Duplicate column labels select a frame; nothing sensible to match on.
        return pd.DataFrame()

    present = names.notna().to_numpy(dtype=bool)
    names_text = names.astype(str)

    # Canonical name -> row POSITION (positions stay unambiguous even when the
    # ODHF index contains duplicate labels).
    exact: Dict[str, int] = {}
    for pos, (name, ok) in enumerate(zip(names_text.tolist(), present.tolist())):
        if ok:
            exact[_canon(str(name))] = pos
    exact.pop("", None)  # names without letters/digits cannot be matched reliably

    columns_by_lower: Dict[str, Any] = {}
    for col in odhf.columns:
        columns_by_lower.setdefault(str(col).lower(), col)

    def _field(row: pd.Series, wanted: str) -> Any:
        col = wanted if wanted in row.index else columns_by_lower.get(wanted)
        return None if col is None else row[col]

    out_rows: List[Dict[str, Any]] = []
    for fac in facilities.dropna().astype(str).unique():
        if not fac.strip():
            continue  # a blank name would "match" every row in the fallback
        pos = exact.get(_canon(fac))
        if pos is None:
            # Literal substring fallback. regex=False matters: facility names
            # routinely contain "(", ")" or "." which are regex metacharacters.
            hits = names_text.str.contains(fac, case=False, na=False, regex=False)
            where = np.flatnonzero(hits.to_numpy(dtype=bool) & present)
            if where.size:
                pos = int(where[0])
        if pos is None:
            continue
        row = odhf.iloc[pos]
        out_rows.append({
            "Facility": fac,
            "city": _field(row, "city"),
            "latitude": _field(row, "latitude"),
            "longitude": _field(row, "longitude"),
        })
    return pd.DataFrame(out_rows)

# -------------------- main: narrative builder --------------------

def _narr_has_numbers(values: pd.Series) -> bool:
    """Return True when at least one entry of *values* parses as a number."""
    return bool(_to_numeric(values).notna().any())


def _narr_find_col(frame: pd.DataFrame, predicate) -> Optional[Any]:
    """Return the first column label whose lower-cased text satisfies *predicate*."""
    for label in frame.columns:
        if predicate(str(label).lower()):
            return label
    return None


def _narr_is_count_name(name: str) -> bool:
    """Recognise the (lower-cased) column names used for record counts."""
    return "count" in name or name in {"n", "records"}


def _narr_counts(frame: pd.DataFrame, cnt_col: Optional[Any]) -> List[Any]:
    """Per-row counts as numbers (NaN if unparseable); ``None`` per row without a count column."""
    if cnt_col is None:
        return [None] * len(frame)
    return _to_numeric(frame[cnt_col]).tolist()


def _narr_rank_line(
    rank: int,
    label: Any,
    metric_name: Any,
    value: Any,
    count: Any,
    baseline: Any,
    band: float,
    min_n: int,
) -> str:
    """Format one numbered "highest average" entry, with count and caution when a count is known."""
    line = f"{rank}. **{label}** — {metric_name}: {_fmt_num(value)}"
    caution = None
    if count is not None and not pd.isna(count):
        n = int(count)
        line += f"; {_pluralize('record', n)}: {n}"
        caution = _small_sample_note(n, min_n)
    line += f" → {_label_vs_baseline(value, baseline, band)}"
    if caution:
        line += f" ({caution})"
    return line


def _narr_zone_line(zone: Any, value: Any, count: Any, baseline: Any, band: float) -> str:
    """Format one zone/category comparison bullet, appending ``n=`` when a count is known."""
    line = (
        f"- **{zone}**: {_fmt_num(value)} "
        f"(vs overall {_fmt_num(baseline)} → {_label_vs_baseline(value, baseline, band)})"
    )
    if count is not None and not pd.isna(count):
        line += f"; n={int(count)}"
    return line


def build_narrative(
    scenario_text: str,
    datasets: Dict[str, Any],
    structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
    metric_hints: Optional[List[str]] = None,
    group_hints: Optional[List[str]] = None,
    min_sample: int = _DEF_MIN_SAMPLE,
    baseline_band: float = 0.05  # ±5% "about average"
) -> str:
    """
    Scenario-agnostic narrative fallback:
      1) Choose best (df, metric) dynamically using name hints + numeric sanity
      2) Prefer structured tables (top facilities / zones) if provided and
         usable; otherwise derive the same sections from the chosen frame
      3) Compute overall baseline + label groups vs baseline
      4) Geo notes via fuzzy Top-5 ↔ ODHF join (<= 3 bullets)
      5) Recommendations grounded in the same metric/groups

    Args:
        scenario_text: Accepted for interface compatibility; not used when
            building the text.
        datasets: Mapping of dataset key to object; only DataFrames are used.
        structured_tables: Optional pre-aggregated tables. The keys read are
            ``"top_facilities"`` and ``"zone_summary"``.
        metric_hints: Name fragments for metric selection
            (default ``_HINT_METRICS_DEFAULT``).
        group_hints: Name fragments for grouping-column detection
            (default ``_HINT_GROUPS_DEFAULT``).
        min_sample: Groups with fewer records get a caution note. Applied to
            both the structured and the derived "highest average" lists.
        baseline_band: Relative band treated as "about average".

    Returns:
        Markdown text. When no usable numeric column exists, a one-line
        message explaining that no narrative could be generated.
    """

    metric_hints = metric_hints or _HINT_METRICS_DEFAULT
    group_hints = group_hints or _HINT_GROUPS_DEFAULT
    datasets = datasets or {}

    # ---------- 1) Pick dataset + metric ----------
    choice = _choose_df_and_metric(datasets, metric_hints)
    if not choice:
        return "No tabular data available. Unable to generate a narrative."
    _, df, primary_metric = choice

    # Ensure numeric
    df = _nanlike_to_nan(df)
    if primary_metric not in df.columns:
        return "Chosen metric missing. Unable to generate a narrative."
    df[primary_metric] = _to_numeric(df[primary_metric])
    # Column labels may be non-strings; all name comparisons use their text form.
    metric_key = str(primary_metric).lower()

    # Optional comparator metric (e.g., consult vs surgery). It must contain
    # real numbers; coercion alone would also accept pure-text columns.
    comparator_hints = ("consult", "median", "wait", "p90", "90th")
    comparator_metric = None
    for c in df.columns:
        if c == primary_metric:
            continue
        cname = str(c).lower()
        if any(h in cname for h in comparator_hints) and _narr_has_numbers(df[c]):
            comparator_metric = c
            break

    # ---------- 2) Prefer structured tables if present ----------
    top_fac = None
    zone_tbl = None
    odhf_df = None

    if structured_tables:
        top_fac = structured_tables.get("top_facilities")
        zone_tbl = structured_tables.get("zone_summary")
        # try to detect ODHF-like table by column fingerprint
        for candidate in datasets.values():
            if not isinstance(candidate, pd.DataFrame):
                continue
            if {"facility_name", "city"} <= {str(c).lower() for c in candidate.columns}:
                odhf_df = candidate
                break

    if not isinstance(top_fac, pd.DataFrame) or top_fac.empty:
        top_fac = None
    if not isinstance(zone_tbl, pd.DataFrame) or zone_tbl.empty:
        zone_tbl = None

    # Compute baseline from the selected df/metric (not from ODHF)
    baseline = df[primary_metric].mean(skipna=True)

    # ---------- 3) Build sections ----------

    sections: List[str] = []

    # Methodology
    meth: List[str] = []
    meth.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
    if comparator_metric is not None:
        meth.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
    # Missing value note
    if df.isna().sum().sum() > 0:
        meth.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
    # Group hints (informative only)
    g1 = _find_group_col(df, group_hints, avoid=[str(primary_metric)])
    if g1 is not None and g1 == primary_metric:
        g1 = None  # never group the metric by itself
    if g1 is not None:
        meth.append(f"Primary grouping inferred: **{g1}**.")
        g2 = _find_group_col(
            df.drop(columns=[g1], errors="ignore"),
            group_hints,
            avoid=[str(primary_metric), str(g1)],
        )
    else:
        g2 = None
    if g2 is not None:
        meth.append(f"Secondary grouping inferred: **{g2}**.")

    sections.append("## Methodology (Auto-generated)")
    for m in meth:
        sections.append(f"- {m}")
    sections.append("")

    # Highest averages by primary grouping (prefer structured Top-5 if given)
    top_lines: List[str] = []
    fac_label_col = None
    ranked_fac = None
    if top_fac is not None:
        # Expect columns like: Facility, Zone, avg_Surgery_Median, count_*
        fac_label_col = _narr_find_col(top_fac, lambda n: "facility" in n)
        cnt_col = _narr_find_col(top_fac, _narr_is_count_name)

        def _looks_like_metric(name: str) -> bool:
            if _narr_is_count_name(name):
                return False  # "count_<metric>" must not be mistaken for the metric
            return metric_key in name or "avg_" in name or "mean" in name

        metric_col = next(
            (c for c in top_fac.columns
             if _looks_like_metric(str(c).lower()) and _narr_has_numbers(top_fac[c])),
            None,
        )
        if metric_col is None:
            # fallback: first column that actually holds numbers
            metric_col = next(
                (c for c in top_fac.columns
                 if c != fac_label_col and c != cnt_col and _narr_has_numbers(top_fac[c])),
                None,
            )

        if metric_col is not None and fac_label_col is not None:
            ranked_fac = top_fac.copy()
            ranked_fac[metric_col] = _to_numeric(ranked_fac[metric_col])
            ranked_fac = ranked_fac.sort_values(by=metric_col, ascending=False).head(5)
            # Values are read by column label; attribute access on itertuples
            # rows breaks for labels that are not valid identifiers.
            rows = zip(
                ranked_fac[fac_label_col].tolist(),
                ranked_fac[metric_col].tolist(),
                _narr_counts(ranked_fac, cnt_col),
            )
            for rank, (label, met, cnt) in enumerate(rows, 1):
                top_lines.append(
                    _narr_rank_line(rank, label, primary_metric, met, cnt,
                                    baseline, baseline_band, min_sample)
                )

    if not top_lines and g1 is not None:
        # No usable structured Top-5: derive it from the inferred grouping.
        grouped = df.groupby(g1, dropna=False)[primary_metric]
        agg = pd.DataFrame({"metric": grouped.mean(), "count": grouped.count()})
        agg = agg.sort_values(by="metric", ascending=False).head(5)
        rows = zip(agg.index.tolist(), agg["metric"].tolist(), agg["count"].tolist())
        for rank, (label, met, cnt) in enumerate(rows, 1):
            top_lines.append(
                _narr_rank_line(rank, label, primary_metric, met, cnt,
                                baseline, baseline_band, min_sample)
            )

    if top_lines:
        sections.append("## Highest average values by group")
        sections.extend(top_lines)
        sections.append("")

    # Zone comparison (prefer structured zone table if present)
    zone_lines: List[str] = []
    if zone_tbl is not None:
        # find zone label & metric columns dynamically
        zone_col = _narr_find_col(zone_tbl, lambda n: "zone" in n)
        zmet_col = next(
            (c for c in zone_tbl.columns
             if c != zone_col and (metric_key in str(c).lower() or "avg" in str(c).lower())),
            None,
        )
        zcnt_col = _narr_find_col(zone_tbl, _narr_is_count_name)

        if zone_col is not None and zmet_col is not None:
            z = zone_tbl.copy()
            # Drop truly missing zones; a literal "Total" row is text and stays.
            z[zone_col] = z[zone_col].astype("string")
            z = z.loc[z[zone_col].notna()].copy()
            z[zmet_col] = _to_numeric(z[zmet_col])
            z = z.sort_values(by=zmet_col, ascending=False)
            rows = zip(z[zone_col].tolist(), z[zmet_col].tolist(), _narr_counts(z, zcnt_col))
            for zone, met, cnt in rows:
                zone_lines.append(_narr_zone_line(zone, met, cnt, baseline, baseline_band))

    if not zone_lines:
        # Derive zones dynamically if a zone-like column exists
        zcol = _find_group_col(df, ["zone"], avoid=[str(primary_metric)])
        if zcol is not None and zcol != primary_metric:
            z = df[[zcol, primary_metric]].copy()
            z[zcol] = z[zcol].astype("string").str.strip()
            # drop true NaN zones, but do NOT fabricate totals
            z = z.loc[z[zcol].notna()]
            grouped = z.groupby(zcol)[primary_metric]
            agg = pd.DataFrame({"metric": grouped.mean(), "count": grouped.count()})
            agg = agg.sort_values(by="metric", ascending=False)
            rows = zip(agg.index.tolist(), agg["metric"].tolist(), agg["count"].tolist())
            for zone, met, cnt in rows:
                zone_lines.append(_narr_zone_line(zone, met, cnt, baseline, baseline_band))

    if zone_lines:
        has_zone_column = any("zone" in str(c).lower() for c in df.columns)
        sections.append(f"## {('Zone' if has_zone_column else 'Category')} comparison vs overall")
        sections.extend(zone_lines)
        sections.append("")

    # Geographic notes — map Top-5 facilities only (if we have both Top-5 and ODHF df)
    geo_lines: List[str] = []
    if (
        top_fac is not None
        and fac_label_col is not None
        and isinstance(odhf_df, pd.DataFrame)
        and not odhf_df.empty
    ):
        # Use the same ranked top five as the list above whenever it exists.
        geo_source = ranked_fac if ranked_fac is not None else top_fac.head(5)
        name_col = next(
            (c for c in odhf_df.columns if str(c).lower() == "facility_name"),
            "facility_name",
        )
        mapped = _map_top_facilities_to_odhf(
            geo_source, odhf_df, fac_col=fac_label_col, odhf_name_col=name_col
        )
        if not mapped.empty:
            for r in mapped.head(3).to_dict(orient="records"):
                f = r.get("Facility")
                city = r.get("city")
                # Omit the parenthetical instead of printing "(None)" / "(nan)".
                place = f" ({city})" if city is not None and not pd.isna(city) else ""
                geo_lines.append(
                    f"- **{f}**{place} is among the highest-average groups; consider capacity and referral patterns."
                )
    if geo_lines:
        sections.append("## Geographic notes")
        sections.extend(geo_lines)
        sections.append("")

    # Recommendations — grounded in the above
    recs: List[str] = []
    if top_lines:
        recs.append("Prioritize operating room time and staffing for the highest-average groups, especially those with substantial volume.")
    if comparator_metric is not None:
        recs.append(f"Track **{comparator_metric}** alongside {primary_metric} to identify upstream bottlenecks (e.g., long consult waits driving surgical delays).")
    if zone_lines:
        recs.append("Address zones persistently above the provincial baseline; deploy targeted resources and load balancing across facilities.")
    recs.append("Apply small-sample caution; pool or validate categories with very few records before acting on outliers.")
    recs.append("Standardize specialty/facility naming to reduce coding-induced variance in aggregates.")

    sections.append("## Recommendations (Auto-generated)")
    for r in recs:
        sections.append(f"- {r}")

    return "\n".join(sections).strip()