Rajan Sharma committed on
Commit
5651d3e
·
verified ·
1 Parent(s): 87088be

Update narrative_safetynet.py

Browse files
Files changed (1) hide show
  1. narrative_safetynet.py +93 -222
narrative_safetynet.py CHANGED
@@ -1,262 +1,133 @@
1
  # narrative_safetynet.py
2
  from __future__ import annotations
3
- from typing import Dict, Any, List, Optional
 
4
  import math
5
  import numpy as np
6
  import pandas as pd
7
- import re
8
 
9
- _DEF_MIN_SAMPLE = 5 # threshold for "interpret with caution" (fully generic)
 
 
 
 
 
 
 
 
 
10
 
11
- def _is_numeric(s: pd.Series) -> bool:
12
- return pd.api.types.is_numeric_dtype(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def _fmt_num(x: Any, decimals: int = 1) -> str:
15
  try:
16
  if x is None or (isinstance(x, float) and math.isnan(x)):
17
  return "n/a"
18
- if isinstance(x, (int, np.integer)):
19
- return f"{x:,}"
20
  return f"{float(x):,.{decimals}f}"
21
  except Exception:
22
  return str(x)
23
 
24
- def _pick_numeric(df: pd.DataFrame, hints: List[str]) -> Optional[str]:
25
- # choose a numeric column; prefer hinted names
26
- cols = list(df.columns)
 
 
 
 
27
  for h in hints:
28
- for c in cols:
29
- if h.lower() in c.lower() and _is_numeric(df[c]):
30
- return c
31
- for c in cols:
32
- if _is_numeric(df[c]):
33
- return c
34
- return None
35
 
36
- def _find_group_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  cols = list(df.columns)
 
38
  for cand in candidates:
39
  for c in cols:
40
- if cand.lower() in c.lower():
 
41
  return c
42
- # fallback: first reasonable categorical column
43
  obj_cols = [c for c in cols if df[c].dtype == "object"]
44
  for c in obj_cols:
45
  nuniq = df[c].nunique(dropna=True)
46
- if 1 < nuniq < max(50, len(df) // 10):
47
  return c
48
  return None
49
 
50
- def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
51
- dff = df.copy()
52
- for c in dff.columns:
53
- if dff[c].dtype == "object":
54
- dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
55
- return dff
56
-
57
- def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
58
- return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
59
 
60
- def _deviation_label(x: float, mu: float, tol: float = 0.01) -> str:
61
- if np.isnan(x) or np.isnan(mu) or mu == 0:
62
  return "unknown"
63
  rel = (x - mu) / mu
64
- if rel > 0.05:
65
  return "higher than average"
66
- if rel < -0.05:
67
  return "lower than average"
68
- if abs(rel) <= max(tol, 0.05):
69
- return "about average"
70
  return "about average"
71
 
72
- def _pluralize(label: str, n: int) -> str:
73
- return f"{label}{'' if n==1 else 's'}"
74
-
75
- def build_narrative(
76
- scenario_text: str,
77
- datasets: Dict[str, Any],
78
- structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
79
- metric_hints: Optional[List[str]] = None,
80
- group_hints: Optional[List[str]] = None,
81
- min_sample: int = _DEF_MIN_SAMPLE
82
- ) -> str:
83
- """
84
- Scenario-agnostic narrative fallback:
85
- - Picks numeric metric & groupings dynamically
86
- - Computes overall baseline + deviations
87
- - Warns on small samples
88
- - Optional geographic notes if city/lat/lon exist
89
- """
90
- metric_hints = metric_hints or ["surgery_median", "consult_median", "wait", "median", "p90", "90th"]
91
- group_hints = group_hints or ["facility", "specialty", "zone", "hospital", "city", "region"]
92
-
93
- # 1) choose first non-empty table-like dataset
94
- df = None
95
- df_key = None
96
- for k, v in datasets.items():
97
- if isinstance(v, pd.DataFrame) and not v.empty:
98
- df = _nanlike_to_nan(v)
99
- df_key = k
100
- break
101
- if df is None:
102
- return "No tabular data available. Unable to generate a narrative."
103
-
104
- # 2) metrics
105
- primary_metric = _pick_numeric(df, metric_hints) # e.g., Surgery_Median
106
- if not primary_metric:
107
- return "No numeric metric found to summarize; please ensure at least one numeric wait-time column is present."
108
-
109
- other_numeric = [c for c in df.columns if _is_numeric(df[c]) and c != primary_metric]
110
- comparator_metric = next(
111
- (c for c in other_numeric if any(h in c.lower() for h in ["consult", "wait", "median", "p90", "90th"])),
112
- None
113
- )
114
-
115
- # 3) groups
116
- group1 = _find_group_col(df, group_hints) # e.g., Facility
117
- group2 = None
118
- if group1:
119
- alt_hints = [h for h in group_hints if h.lower() not in group1.lower()]
120
- group2 = _find_group_col(df.drop(columns=[group1], errors="ignore"), alt_hints)
121
-
122
- # 4) baseline + grouped
123
- baseline = pd.to_numeric(df[primary_metric], errors="coerce").mean(skipna=True)
124
-
125
- def _group_stats(col: str) -> Optional[pd.DataFrame]:
126
- if not col:
127
- return None
128
- tmp = df.copy()
129
- tmp[primary_metric] = pd.to_numeric(tmp[primary_metric], errors="coerce")
130
- comp_col = comparator_metric or primary_metric
131
- if comp_col in tmp.columns:
132
- tmp[comp_col] = pd.to_numeric(tmp[comp_col], errors="coerce")
133
- agg = (
134
- tmp.groupby(col, dropna=False)
135
- .agg(
136
- metric=(primary_metric, "mean"),
137
- count=(primary_metric, "count"),
138
- comp=(comp_col, "mean") if comp_col in tmp.columns else (primary_metric, "mean"),
139
- )
140
- .reset_index()
141
- )
142
- return agg
143
-
144
- g1 = _group_stats(group1)
145
- g2 = _group_stats(group2)
146
-
147
- # 5) Top groups (by primary metric) from group1
148
- top_lines: List[str] = []
149
- if isinstance(g1, pd.DataFrame) and not g1.empty:
150
- g1 = g1.sort_values(by="metric", ascending=False)
151
- k = min(5, len(g1))
152
- for i, row in enumerate(g1.head(k).itertuples(index=False), 1):
153
- label = getattr(row, group1)
154
- metric = getattr(row, "metric")
155
- comp = getattr(row, "comp")
156
- cnt = getattr(row, "count")
157
- devlab = _deviation_label(metric, baseline)
158
- caution = _small_sample_note(int(cnt), min_sample)
159
- msg = f"{i}. **{label}** — {primary_metric}: {_fmt_num(metric)}"
160
- if comparator_metric:
161
- msg += f"; {comparator_metric}: {_fmt_num(comp)}"
162
- msg += f"; {_pluralize('record', int(cnt))}: {cnt}"
163
- msg += f" → {devlab}"
164
- if caution:
165
- msg += f" ({caution})"
166
- top_lines.append(msg)
167
-
168
- # 6) Group2 overview
169
- region_lines: List[str] = []
170
- if isinstance(g2, pd.DataFrame) and not g2.empty:
171
- g2 = g2.sort_values(by="metric", ascending=False)
172
- for row in g2.itertuples(index=False):
173
- label = getattr(row, group2)
174
- metric = getattr(row, "metric")
175
- comp = getattr(row, "comp")
176
- cnt = getattr(row, "count")
177
- devlab = _deviation_label(metric, baseline)
178
- caution = _small_sample_note(int(cnt), min_sample)
179
- line = f"- **{label}**: {_fmt_num(metric)} (vs. overall {_fmt_num(baseline)} → {devlab}); n={cnt}"
180
- if comparator_metric:
181
- line += f"; {comparator_metric}: {_fmt_num(comp)}"
182
- if caution:
183
- line += f" — {caution}"
184
- region_lines.append(line)
185
-
186
- # 7) Geographic notes (optional)
187
- geo_notes: List[str] = []
188
- city_col = next((c for c in df.columns if re.search(r"\bcity\b", c, re.I)), None)
189
- lat_col = next((c for c in df.columns if re.search(r"\b(lat|latitude)\b", c, re.I)), None)
190
- lon_col = next((c for c in df.columns if re.search(r"\b(lon|longitude)\b", c, re.I)), None)
191
- if group1 and city_col and (lat_col and lon_col):
192
- if isinstance(g1, pd.DataFrame) and not g1.empty and group1 in df.columns:
193
- top_labels = g1[group1].astype(str).head(10).tolist()
194
- sub = df[df[group1].astype(str).isin(top_labels)].copy()
195
- if not sub.empty:
196
- sub[primary_metric] = pd.to_numeric(sub[primary_metric], errors="coerce")
197
- by_city = (
198
- sub.groupby(city_col, dropna=False)[primary_metric]
199
- .mean()
200
- .reset_index()
201
- .sort_values(by=primary_metric, ascending=False)
202
- )
203
- for r in by_city.head(3).to_dict(orient="records"):
204
- cname = r.get(city_col)
205
- val = r.get(primary_metric)
206
- geo_notes.append(f"- **{cname}** shows higher average {primary_metric} among top groups ({_fmt_num(val)}).")
207
-
208
- # 8) Methodology (auto)
209
- methodology: List[str] = []
210
- na_counts = df.isna().sum().sum()
211
- if na_counts > 0:
212
- methodology.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
213
- methodology.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
214
- if comparator_metric:
215
- methodology.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
216
- if group1:
217
- methodology.append(f"Primary grouping inferred: **{group1}**.")
218
- if group2:
219
- methodology.append(f"Secondary grouping inferred: **{group2}**.")
220
- if min_sample != _DEF_MIN_SAMPLE:
221
- methodology.append(f"Small-sample threshold set to {min_sample} records.")
222
-
223
- # 9) Compose markdown
224
- lines: List[str] = []
225
- lines.append("## Methodology (Auto-generated)")
226
- for m in methodology:
227
- lines.append(f"- {m}")
228
- lines.append("")
229
-
230
- if top_lines:
231
- lines.append("## Highest average values by group")
232
- lines.extend(top_lines)
233
- lines.append("")
234
-
235
- if region_lines:
236
- lines.append(f"## {group2 or 'Region/Category'} comparison vs overall")
237
- lines.extend(region_lines)
238
- lines.append("")
239
 
240
- if geo_notes:
241
- lines.append("## Geographic notes")
242
- lines.extend(geo_notes)
243
- lines.append("")
244
 
245
- recs: List[str] = []
246
- if top_lines:
247
- recs.append("Prioritize resources to the highest-average groups (above overall baseline), especially those with sufficient volume.")
248
- if comparator_metric:
249
- recs.append(f"Cross-check {comparator_metric} trends to identify upstream bottlenecks (e.g., long consult waits pushing surgery waits).")
250
- if isinstance(g2, pd.DataFrame) and not g2.empty:
251
- high = g2[g2["metric"] > baseline]
252
- if not high.empty:
253
- recs.append(f"Address disparities where average **{primary_metric}** exceeds the overall baseline.")
254
- recs.append("For very small groups, validate data quality and consider pooling across similar categories to stabilize estimates.")
255
- recs.append("Validate coding differences (similar specialties or labels spelled differently) to ensure apples-to-apples comparison.")
256
 
257
- lines.append("## Recommendations (Auto-generated)")
258
- for r in recs:
259
- lines.append(f"- {r}")
260
 
261
- return "\n".join(lines).strip()
262
 
 
1
  # narrative_safetynet.py
2
  from __future__ import annotations
3
+ from typing import Dict, Any, List, Optional, Tuple
4
+ import re
5
  import math
6
  import numpy as np
7
  import pandas as pd
 
8
 
9
+ # -------------------- helpers: dtype / formatting --------------------
10
+
11
+ _DEF_MIN_SAMPLE = 5 # generic caution threshold for group sizes
12
+
13
+ _HINT_METRICS_DEFAULT = [
14
+ "surgery_median", "consult_median",
15
+ "surgery_90th", "consult_90th",
16
+ "surgery", "consult",
17
+ "wait", "median", "p90", "90th"
18
+ ]
19
 
20
+ _HINT_GROUPS_DEFAULT = [
21
+ "facility", "specialty", "zone",
22
+ "hospital", "city", "region"
23
+ ]
24
+
25
+ _BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]
26
+
27
+ def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
28
+ dff = df.copy()
29
+ for c in dff.columns:
30
+ if dff[c].dtype == "object":
31
+ dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
32
+ return dff
33
+
34
+ def _is_numeric_series(s: pd.Series) -> bool:
35
+ try:
36
+ return pd.api.types.is_numeric_dtype(s)
37
+ except Exception:
38
+ return False
39
+
40
+ def _to_numeric(s: pd.Series) -> pd.Series:
41
+ return pd.to_numeric(s, errors="coerce")
42
 
43
  def _fmt_num(x: Any, decimals: int = 1) -> str:
44
  try:
45
  if x is None or (isinstance(x, float) and math.isnan(x)):
46
  return "n/a"
47
+ if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
48
+ return f"{int(round(float(x))):,}"
49
  return f"{float(x):,.{decimals}f}"
50
  except Exception:
51
  return str(x)
52
 
53
+ # -------------------- metric & dataset selection (dynamic) --------------------
54
+
55
+ def _score_metric_name(col: str, hints: List[str]) -> int:
56
+ name = (col or "").lower()
57
+ if any(bad in name for bad in _BAD_METRIC_NAMES):
58
+ return -10**6 # disqualify obvious counters/ids
59
+ score = 0
60
  for h in hints:
61
+ if h in name:
62
+ score += 3
63
+ return score
 
 
 
 
64
 
65
+ def _choose_df_and_metric(
66
+ datasets: Dict[str, Any],
67
+ metric_hints: List[str]
68
+ ) -> Optional[Tuple[str, pd.DataFrame, str]]:
69
+ """
70
+ Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
71
+ +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.
72
+ """
73
+ best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
74
+ for key, v in datasets.items():
75
+ if not isinstance(v, pd.DataFrame) or v.empty:
76
+ continue
77
+ df = _nanlike_to_nan(v)
78
+ for col in df.columns:
79
+ col_num = _to_numeric(df[col])
80
+ if not _is_numeric_series(col_num):
81
+ continue
82
+ s = _score_metric_name(col, metric_hints)
83
+ if col_num.nunique(dropna=True) > 1:
84
+ s += 1
85
+ if best is None or s > best[0]:
86
+ best = (s, key, df, col)
87
+ if best is None:
88
+ return None
89
+ _, key, df, metric = best
90
+ return key, df, metric
91
+
92
+ # -------------------- grouping detection (dynamic) --------------------
93
+
94
+ def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
95
+ avoid = [a.lower() for a in (avoid or [])]
96
  cols = list(df.columns)
97
+ # prefer name matches
98
  for cand in candidates:
99
  for c in cols:
100
+ cname = c.lower()
101
+ if cand.lower() in cname and all(a not in cname for a in avoid):
102
  return c
103
+ # fallback: a categorical with reasonable cardinality
104
  obj_cols = [c for c in cols if df[c].dtype == "object"]
105
  for c in obj_cols:
106
  nuniq = df[c].nunique(dropna=True)
107
+ if 1 < nuniq < max(50, len(df)//10):
108
  return c
109
  return None
110
 
111
+ # -------------------- labels & cautions --------------------
 
 
 
 
 
 
 
 
112
 
113
+ def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
114
+ if pd.isna(x) or pd.isna(mu) or mu == 0:
115
  return "unknown"
116
  rel = (x - mu) / mu
117
+ if rel > band:
118
  return "higher than average"
119
+ if rel < -band:
120
  return "lower than average"
 
 
121
  return "about average"
122
 
123
+ def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
124
+ return f"Interpret averages cautiously (only {n} records)." if n < min_n else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ def _pluralize(word: str, n: int) -> str:
127
+ return f"{word}{'' if n == 1 else 's'}"
 
 
128
 
129
+ # -------------------- geo join (Top-5 only) --------------------
 
 
 
 
 
 
 
 
 
 
130
 
131
+ def _canon(s: str) -> s_
 
 
132
 
 
133