Spaces:
Sleeping
Sleeping
Rajan Sharma
committed on
Update narrative_safetynet.py
Browse files- narrative_safetynet.py +278 -1
narrative_safetynet.py
CHANGED
|
@@ -128,6 +128,283 @@ def _pluralize(word: str, n: int) -> str:
|
|
| 128 |
|
| 129 |
# -------------------- geo join (Top-5 only) --------------------
|
| 130 |
|
| 131 |
-
def _canon(s: str) ->
|
|
|
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
|
|
|
|
|
| 128 |
|
| 129 |
# -------------------- geo join (Top-5 only) --------------------
|
| 130 |
|
| 131 |
+
def _canon(s: str) -> str:
    """Return a canonical matching key for a name.

    The value is lower-cased and every character outside ``a-z0-9`` is
    removed, so names that differ only in case, spacing or punctuation
    produce the same key.

    Args:
        s: The name to canonicalise. Falsy values (``None``, ``""``) yield an
            empty key. Truthy non-string values are converted with ``str()``
            instead of raising ``AttributeError``.

    Returns:
        The canonical key; may be empty.
    """
    text = s or ""
    if not isinstance(text, str):
        # Names read from a DataFrame column can be numbers; stringify them
        # rather than failing on the missing ``.lower()`` method.
        text = str(text)
    return re.sub(r"[^a-z0-9]+", "", text.lower())
|
| 133 |
|
| 134 |
+
def _map_top_facilities_to_odhf(
    top_facilities: pd.DataFrame,
    odhf: pd.DataFrame,
    fac_col: str = "Facility",
    odhf_name_col: str = "facility_name"
) -> pd.DataFrame:
    """Attach city and coordinates from an ODHF-like table to facility names.

    Each distinct, non-null facility name in ``top_facilities[fac_col]`` is
    matched against ``odhf[odhf_name_col]``:

    1. exact match on the canonical key produced by ``_canon``;
    2. otherwise a case-insensitive *literal* substring match, taking the
       first ODHF row that contains the facility name.

    When several ODHF rows share a name, the first one is used in both steps.

    Args:
        top_facilities: Table holding the facility names to look up.
        odhf: Reference table with facility names and location columns.
        fac_col: Column of ``top_facilities`` that holds the facility name.
        odhf_name_col: Column of ``odhf`` that holds the facility name.

    Returns:
        A DataFrame with one row per matched facility and the columns
        ``Facility``, ``city``, ``latitude`` and ``longitude`` (a value is
        ``None`` when the ODHF table has no such column). An empty DataFrame
        is returned when either input is missing/empty, when a required name
        column is absent, or when nothing matches.
    """
    if odhf is None or odhf.empty or top_facilities is None or top_facilities.empty:
        return pd.DataFrame()
    # Best-effort join: a missing name column means "no geo data", not an error.
    if odhf_name_col not in odhf.columns or fac_col not in top_facilities.columns:
        return pd.DataFrame()

    names = odhf[odhf_name_col]
    if not isinstance(names, pd.Series):
        # Duplicate column labels make this a DataFrame; no single name to match.
        return pd.DataFrame()
    present = names.notna()
    name_text = names.astype(str)

    # Canonical name -> row *position*. Positions (not index labels) keep the
    # lookup unambiguous when the ODHF index contains duplicate labels.
    exact: Dict[str, int] = {}
    for pos, (has_name, name) in enumerate(zip(present, name_text)):
        if not has_name:
            continue
        key = _canon(name)
        if key:
            exact.setdefault(key, pos)

    def _resolve(column: str) -> Any:
        """Find ``column`` in ``odhf``, falling back to a case-insensitive match."""
        if column in odhf.columns:
            return column
        return next((c for c in odhf.columns if str(c).lower() == column), None)

    source_cols = {name: _resolve(name) for name in ("city", "latitude", "longitude")}

    out_rows: List[Dict[str, Any]] = []
    for fac in top_facilities[fac_col].dropna().astype(str).unique():
        if not fac.strip():
            # A blank name would "match" every row in the substring fallback.
            continue
        pos = exact.get(_canon(fac))
        if pos is not None:
            row = odhf.iloc[pos]
        else:
            # regex=False: facility names routinely contain regex
            # metacharacters such as "(", ".", "+" and must be matched literally.
            mask = present & name_text.str.contains(fac, case=False, na=False, regex=False)
            cand = odhf[mask]
            if cand.empty:
                continue
            row = cand.iloc[0]
        record: Dict[str, Any] = {"Facility": fac}
        for out_name, src in source_cols.items():
            record[out_name] = row[src] if src is not None else None
        out_rows.append(record)
    return pd.DataFrame(out_rows)
|
| 165 |
+
|
| 166 |
+
# -------------------- main: narrative builder --------------------
|
| 167 |
+
|
| 168 |
+
def build_narrative(
    scenario_text: str,
    datasets: Dict[str, Any],
    structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
    metric_hints: Optional[List[str]] = None,
    group_hints: Optional[List[str]] = None,
    min_sample: int = _DEF_MIN_SAMPLE,
    baseline_band: float = 0.05  # ±5% "about average"
) -> str:
    """
    Scenario-agnostic narrative fallback rendered as Markdown.

    1) Choose the (df, metric) pair via ``_choose_df_and_metric``
    2) Prefer structured tables (top facilities / zone summary) if provided
    3) Compute the overall baseline and label groups vs that baseline
    4) Geo notes via the Top-5 → ODHF join (<= 3 bullets)
    5) Recommendations grounded in the same metric/groups

    Args:
        scenario_text: Accepted for interface compatibility; not read here.
        datasets: Mapping of dataset name to object. DataFrames in it are
            candidates for the primary table and for the ODHF-like lookup
            table (detected by having ``facility_name`` and ``city`` columns,
            compared case-insensitively).
        structured_tables: Optional pre-computed tables. The keys
            ``"top_facilities"`` and ``"zone_summary"`` are used when present.
        metric_hints: Name hints for metric selection; defaults to
            ``_HINT_METRICS_DEFAULT``.
        group_hints: Name hints for grouping columns; defaults to
            ``_HINT_GROUPS_DEFAULT``.
        min_sample: Threshold forwarded to ``_small_sample_note``.
        baseline_band: Tolerance forwarded to ``_label_vs_baseline``.

    Returns:
        The Markdown narrative, or a one-line explanation when no usable
        table/metric is found.
    """
    metric_hints = metric_hints or _HINT_METRICS_DEFAULT
    group_hints = group_hints or _HINT_GROUPS_DEFAULT

    # ---------- 1) Pick dataset + metric ----------
    choice = _choose_df_and_metric(datasets, metric_hints)
    if not choice:
        return "No tabular data available. Unable to generate a narrative."
    _df_key, df, primary_metric = choice

    # Ensure numeric
    df = _nanlike_to_nan(df)
    if primary_metric not in df.columns:
        return "Chosen metric missing. Unable to generate a narrative."
    df[primary_metric] = _to_numeric(df[primary_metric])
    metric_key = str(primary_metric).lower()

    # Optional comparator metric (e.g., consult vs surgery)
    comparator_metric = None
    for c in df.columns:
        if c == primary_metric:
            continue
        if _is_numeric_series(_to_numeric(df[c])):
            name = str(c).lower()
            if any(h in name for h in ["consult", "median", "wait", "p90", "90th"]):
                comparator_metric = c
                break

    # ---------- 2) Prefer structured tables if present ----------
    top_fac = None
    zone_tbl = None
    if structured_tables:
        top_fac = structured_tables.get("top_facilities")
        zone_tbl = structured_tables.get("zone_summary")
    has_top_fac = isinstance(top_fac, pd.DataFrame) and not top_fac.empty

    # Compute baseline from the selected df/metric (not from ODHF)
    baseline = df[primary_metric].mean(skipna=True)

    # ---------- 3) Build sections ----------
    sections: List[str] = []

    # Methodology
    meth: List[str] = []
    meth.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
    if comparator_metric:
        meth.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
    # Missing value note
    if df.isna().sum().sum() > 0:
        meth.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
    # Group hints (informative only)
    g1 = _find_group_col(df, group_hints, avoid=[primary_metric])
    if g1:
        meth.append(f"Primary grouping inferred: **{g1}**.")
    g2 = _find_group_col(
        df.drop(columns=[g1], errors="ignore") if g1 else df,
        group_hints,
        avoid=[primary_metric, g1 or ""],
    )
    if g2:
        meth.append(f"Secondary grouping inferred: **{g2}**.")

    sections.append("## Methodology (Auto-generated)")
    for m in meth:
        sections.append(f"- {m}")
    sections.append("")

    # Highest averages by primary grouping (prefer structured Top-5 if given)
    top_lines: List[str] = []
    # Rows handed to the geo join; replaced by the sorted Top-5 below so the
    # geographic notes describe the same facilities as the ranked list.
    geo_source = top_fac.head(5) if has_top_fac else None
    if has_top_fac:
        # Expect columns like: Facility, Zone, avg_Surgery_Median, count_*
        # Keep dynamic: find a numeric column aligned to primary_metric by hint matching
        def _numeric(col: Any) -> bool:
            return bool(_is_numeric_series(_to_numeric(top_fac[col])))

        metric_col = next(
            (
                c for c in top_fac.columns
                if (metric_key in str(c).lower()
                    or any(h in str(c).lower() for h in ["avg_", "mean"]))
                and _numeric(c)
            ),
            None,
        )
        if metric_col is None:
            # fallback: first numeric col
            metric_col = next((c for c in top_fac.columns if _numeric(c)), None)

        cnt_col = next(
            (c for c in top_fac.columns
             if "count" in str(c).lower() or str(c).lower() in {"n", "records"}),
            None,
        )
        lab_col = next((c for c in top_fac.columns if "facility" in str(c).lower()), None)

        if metric_col is not None and lab_col is not None:
            # already sorted in your executor; if not, sort desc
            ranked = top_fac.copy()
            ranked[metric_col] = _to_numeric(ranked[metric_col])
            ranked = ranked.sort_values(by=metric_col, ascending=False).head(5)
            geo_source = ranked
            counts = ranked[cnt_col] if cnt_col is not None else [None] * len(ranked)
            # Iterate the columns directly: itertuples() renames columns that
            # are not valid identifiers, which breaks getattr() lookups.
            for i, (label, met, raw_cnt) in enumerate(
                zip(ranked[lab_col], ranked[metric_col], counts), 1
            ):
                cnt = _coerce_count(raw_cnt)
                dev = _label_vs_baseline(met, baseline, baseline_band)
                msg = f"{i}. **{label}** — {primary_metric}: {_fmt_num(met)}"
                if cnt is not None:
                    msg += f"; {_pluralize('record', cnt)}: {cnt}"
                msg += f" — {dev}"
                caution = _small_sample_note(cnt, min_sample) if cnt is not None else None
                if caution:
                    msg += f" ({caution})"
                top_lines.append(msg)
    elif g1:
        # No structured Top-5 provided: derive from g1
        tmp = df.copy()
        tmp[primary_metric] = _to_numeric(tmp[primary_metric])
        agg = (
            tmp.groupby(g1, dropna=False)
            .agg(metric=(primary_metric, "mean"), count=(primary_metric, "count"))
            .reset_index()
            .sort_values(by="metric", ascending=False)
            .head(5)
        )
        for i, (label, met, raw_cnt) in enumerate(
            zip(agg[g1], agg["metric"], agg["count"]), 1
        ):
            cnt = int(raw_cnt)
            dev = _label_vs_baseline(met, baseline, baseline_band)
            caution = _small_sample_note(cnt, min_sample)
            msg = (
                f"{i}. **{label}** — {primary_metric}: {_fmt_num(met)}; "
                f"{_pluralize('record', cnt)}: {cnt} — {dev}"
            )
            if caution:
                msg += f" ({caution})"
            top_lines.append(msg)

    if top_lines:
        sections.append("## Highest average values by group")
        sections.extend(top_lines)
        sections.append("")

    # Zone comparison (prefer structured zone table if present)
    zone_lines: List[str] = []
    if isinstance(zone_tbl, pd.DataFrame) and not zone_tbl.empty:
        # find zone label & metric columns dynamically
        zone_col = next((c for c in zone_tbl.columns if "zone" in str(c).lower()), None)
        zmet_col = next(
            (c for c in zone_tbl.columns
             if metric_key in str(c).lower() or "avg" in str(c).lower()),
            None,
        )
        zcnt_col = next(
            (c for c in zone_tbl.columns
             if "count" in str(c).lower() or str(c).lower() in {"n", "records"}),
            None,
        )

        if zone_col is not None and zmet_col is not None:
            z = zone_tbl.copy()
            # Drop truly missing zones; a literal "Total" row is non-null and stays.
            z[zone_col] = z[zone_col].astype("string")
            z = z[z[zone_col].notna()].copy()
            z[zmet_col] = _to_numeric(z[zmet_col])
            z = z.sort_values(by=zmet_col, ascending=False)
            counts = z[zcnt_col] if zcnt_col is not None else [None] * len(z)

            for zone, met, raw_cnt in zip(z[zone_col], z[zmet_col], counts):
                lab = _label_vs_baseline(met, baseline, baseline_band)
                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} — {lab})"
                cnt = _coerce_count(raw_cnt)
                if cnt is not None:
                    msg += f"; n={cnt}"
                zone_lines.append(msg)
    else:
        # Derive zones dynamically if a zone-like column exists
        zcol = _find_group_col(df, ["zone"])
        if zcol:
            z = df.copy()
            z[zcol] = z[zcol].astype("string").str.strip()
            # drop true NaN zones, but do NOT fabricate totals
            z = z[z[zcol].notna()]
            agg = (
                z.groupby(zcol, dropna=False)[primary_metric]
                .agg(["mean", "count"]).reset_index()
                .rename(columns={"mean": "metric", "count": "count"})
                .sort_values(by="metric", ascending=False)
            )
            for zone, met, cnt in zip(agg[zcol], agg["metric"], agg["count"]):
                lab = _label_vs_baseline(met, baseline, baseline_band)
                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} — {lab}); n={cnt}"
                zone_lines.append(msg)

    if zone_lines:
        # Test each column name on its own so "zone" cannot appear by accident
        # across the boundary of two concatenated names.
        has_zone_col = any("zone" in str(c).lower() for c in df.columns)
        heading = "Zone" if has_zone_col else "Category"
        sections.append(f"## {heading} comparison vs overall")
        sections.extend(zone_lines)
        sections.append("")

    # Geographic notes — map Top-5 facilities only (if we have both Top-5 and ODHF df)
    geo_lines: List[str] = []
    if has_top_fac:
        fac_col = next((c for c in top_fac.columns if "facility" in str(c).lower()), None)
        odhf_df = None
        if fac_col is not None:
            # try to detect ODHF-like table by column fingerprint
            for candidate in datasets.values():
                if isinstance(candidate, pd.DataFrame) and {"facility_name", "city"}.issubset(
                    {str(c).lower() for c in candidate.columns}
                ):
                    odhf_df = candidate
                    break
        if odhf_df is not None and not odhf_df.empty:
            name_col = next(
                (c for c in odhf_df.columns if str(c).lower() == "facility_name"),
                "facility_name",
            )
            mapped = _map_top_facilities_to_odhf(
                geo_source, odhf_df, fac_col=fac_col, odhf_name_col=name_col
            )
            if not mapped.empty:
                for rec in mapped.head(3).to_dict(orient="records"):
                    facility = rec.get("Facility")
                    city = rec.get("city")
                    # Omit the parenthetical rather than print "(None)"/"(nan)".
                    where = f" ({city})" if isinstance(city, str) and city.strip() else ""
                    geo_lines.append(
                        f"- **{facility}**{where} is among the highest-average groups; "
                        "consider capacity and referral patterns."
                    )
    if geo_lines:
        sections.append("## Geographic notes")
        sections.extend(geo_lines)
        sections.append("")

    # Recommendations — grounded in the above
    recs: List[str] = []
    if top_lines:
        recs.append("Prioritize operating room time and staffing for the highest-average groups, especially those with substantial volume.")
    if comparator_metric:
        recs.append(f"Track **{comparator_metric}** alongside {primary_metric} to identify upstream bottlenecks (e.g., long consult waits driving surgical delays).")
    if zone_lines:
        recs.append("Address zones persistently above the provincial baseline; deploy targeted resources and load balancing across facilities.")
    recs.append("Apply small-sample caution; pool or validate categories with very few records before acting on outliers.")
    recs.append("Standardize specialty/facility naming to reduce coding-induced variance in aggregates.")

    sections.append("## Recommendations (Auto-generated)")
    for r in recs:
        sections.append(f"- {r}")

    return "\n".join(sections).strip()


def _coerce_count(value: Any) -> Optional[int]:
    """Return ``value`` as an int, or ``None`` if it is missing or not a finite number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if not np.isfinite(number):
        return None
    return int(number)
|