Rajan Sharma committed on
Commit
1c8ef92
·
verified ·
1 Parent(s): a10f7d6

Update auto_metrics.py

Browse files
Files changed (1) hide show
  1. auto_metrics.py +396 -108
auto_metrics.py CHANGED
@@ -1,9 +1,10 @@
1
  from __future__ import annotations
2
- from typing import Dict, Any, Tuple, Optional, List
3
  import pandas as pd
4
  import numpy as np
5
  from data_registry import DataRegistry
6
  from schema_mapper import MappingResult
 
7
 
8
  def _get(reg: DataRegistry, mapping: MappingResult, concept: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
9
  if concept not in mapping.resolved:
@@ -11,6 +12,11 @@ def _get(reg: DataRegistry, mapping: MappingResult, concept: str) -> Tuple[Optio
11
  tname, col = mapping.resolved[concept]
12
  return reg.get(tname), col
13
 
 
 
 
 
 
14
  def _fmt_tbl(df: pd.DataFrame, max_rows: int = 20) -> str:
15
  if df is None or df.empty:
16
  return "_<empty table>_"
@@ -19,127 +25,409 @@ def _fmt_tbl(df: pd.DataFrame, max_rows: int = 20) -> str:
19
  df2 = df2.head(max_rows)
20
  return df2.to_markdown(index=False)
21
 
22
- def compute_facility_wait_ranks(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
23
- df_fac, col_fac = _get(reg, mapping, "facility")
24
- if df_fac is None or col_fac is None:
25
- return None
26
- wait_col = None
27
- for key in ("wait_median", "wait_days", "wait_p90"):
28
- dfw, colw = _get(reg, mapping, key)
29
- if dfw is not None and colw is not None and dfw is df_fac:
30
- wait_col = colw
31
- break
32
- if wait_col is None:
33
- return None
34
- g = df_fac.groupby(col_fac, dropna=True)[wait_col].apply(pd.to_numeric, errors="coerce").mean().reset_index()
35
- g = g.rename(columns={wait_col: "avg_wait"})
36
- g = g.sort_values("avg_wait", ascending=False)
37
- g["rank"] = np.arange(1, len(g) + 1)
38
- return g[[col_fac, "avg_wait", "rank"]]
39
 
40
- def compute_specialty_wait_ranks(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
41
- df, col_spec = _get(reg, mapping, "specialty")
42
- if df is None or col_spec is None:
43
- return None
44
- wait_col = None
45
- for key in ("wait_median", "wait_days", "wait_p90"):
46
- dfw, colw = _get(reg, mapping, key)
47
- if dfw is not None and colw is not None and dfw is df:
48
- wait_col = colw
49
- break
50
- if wait_col is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  return None
52
- g = df.groupby(col_spec, dropna=True)[wait_col].apply(pd.to_numeric, errors="coerce").mean().reset_index()
53
- g = g.rename(columns={wait_col: "avg_wait"})
54
- g = g.sort_values("avg_wait", ascending=False)
55
- g["rank"] = np.arange(1, len(g) + 1)
56
- return g[[col_spec, "avg_wait", "rank"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- def compute_zone_comparison(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
59
- df, col_zone = _get(reg, mapping, "zone")
60
- if df is None or col_zone is None:
 
 
 
61
  return None
62
- wait_col = None
63
- for key in ("wait_median", "wait_days", "wait_p90"):
64
- dfw, colw = _get(reg, mapping, key)
65
- if dfw is not None and colw is not None and dfw is df:
66
- wait_col = colw
67
- break
68
- if wait_col is None:
 
 
 
 
 
69
  return None
70
- g = df.groupby(col_zone, dropna=True)[wait_col].apply(pd.to_numeric, errors="coerce").mean().reset_index()
71
- g = g.rename(columns={wait_col: "avg_wait"})
72
- g = g.sort_values("avg_wait", ascending=False)
73
- return g[[col_zone, "avg_wait"]]
74
-
75
- def compute_capacity_snapshot(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
76
- df, col_beds = _get(reg, mapping, "capacity_beds")
77
- if df is None or col_beds is None:
78
  return None
79
- s = pd.to_numeric(df[col_beds], errors="coerce")
80
- out = pd.DataFrame({
81
- "metric": ["staffed_beds_total", "staffed_beds_mean"],
82
- "value": [int(np.nansum(s)), float(np.nanmean(s))]
 
 
 
83
  })
84
- return out
 
 
 
 
 
 
 
 
 
 
85
 
86
- def compute_costs_example(reg: DataRegistry, mapping: MappingResult, n_clients: int = 1200) -> Optional[pd.DataFrame]:
87
- dfF, colF = _get(reg, mapping, "cost_fixed")
88
- dfV, colV = _get(reg, mapping, "cost_variable")
89
- if colV is None and colF is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  return None
91
- fixed = float(pd.to_numeric(dfF[colF], errors="coerce").sum()) if (dfF is not None and colF is not None) else 0.0
92
- var = float(pd.to_numeric(dfV[colV], errors="coerce").mean()) if (dfV is not None and colV is not None) else np.nan
93
- total = fixed + (var * n_clients if np.isfinite(var) else np.nan)
94
- return pd.DataFrame({
95
- "component": ["fixed_total", "variable_per_client", f"program_total_for_{n_clients}"],
96
- "value": [fixed, var, total]
 
 
 
 
 
 
 
 
 
97
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- def build_data_findings_markdown(reg: DataRegistry, mapping: MappingResult, topn: int = 5):
100
- missing: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- fac = compute_facility_wait_ranks(reg, mapping)
103
- if fac is None or fac.empty:
104
- missing.append("facility_wait_ranks")
105
- fac_md = "_Not available (need facility + wait columns in the same table)._"
106
- else:
107
- fac_md = fac.head(topn).to_markdown(index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- spec = compute_specialty_wait_ranks(reg, mapping)
110
- if spec is None or spec.empty:
111
- missing.append("specialty_wait_ranks")
112
- spec_md = "_Not available (need specialty + wait columns in the same table)._"
113
- else:
114
- spec_md = spec.head(topn).to_markdown(index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- zone = compute_zone_comparison(reg, mapping)
117
- if zone is None or zone.empty:
118
- missing.append("zone_wait_comparison")
119
- zone_md = "_Not available (need zone + wait columns)._"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  else:
121
- zone_md = zone.to_markdown(index=False)
122
-
123
- cap = compute_capacity_snapshot(reg, mapping)
124
- if cap is None or cap.empty:
125
- missing.append("capacity_snapshot")
126
- cap_md = "_Not available (need staffed beds column)._"
127
  else:
128
- cap_md = cap.to_markdown(index=False)
129
-
130
- costs = compute_costs_example(reg, mapping, n_clients=1200)
131
- if costs is None or costs.empty:
132
- missing.append("costs")
133
- costs_md = "_Not available (need fixed/variable costs)._"
 
 
 
 
 
 
 
 
134
  else:
135
- costs_md = costs.to_markdown(index=False)
136
-
137
- md = (
138
- "### Data-Derived Findings (computed in Python)\n\n"
139
- "**Top Facilities by Avg Wait**\n\n" + fac_md + "\n\n"
140
- "**Top Specialties by Avg Wait**\n\n" + spec_md + "\n\n"
141
- "**Zone Comparison (Avg Wait)**\n\n" + zone_md + "\n\n"
142
- "**Capacity Snapshot**\n\n" + cap_md + "\n\n"
143
- "**Cost Illustration (for 1,200 clients)**\n\n" + costs_md + "\n"
144
- )
145
- return md, missing
 
1
  from __future__ import annotations
2
+ from typing import Dict, Any, Tuple, Optional, List, Union
3
  import pandas as pd
4
  import numpy as np
5
  from data_registry import DataRegistry
6
  from schema_mapper import MappingResult
7
+ import re
8
 
9
  def _get(reg: DataRegistry, mapping: MappingResult, concept: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
10
  if concept not in mapping.resolved:
 
12
  tname, col = mapping.resolved[concept]
13
  return reg.get(tname), col
14
 
15
def _clean_numeric_series(series: pd.Series) -> pd.Series:
    """Convert a series to numbers, treating placeholder tokens as missing.

    Blank strings, dashes and the usual "null" / "N/A" spellings are swapped
    for NaN first; anything that still cannot be parsed as a number is then
    coerced to NaN as well.
    """
    missing_tokens = ['', '—', '-', 'null', 'NULL', 'N/A', 'n/a', ' ', 'nan']
    without_placeholders = series.replace(to_replace=missing_tokens, value=np.nan)
    as_numbers = pd.to_numeric(without_placeholders, errors='coerce')
    return as_numbers
19
+
20
  def _fmt_tbl(df: pd.DataFrame, max_rows: int = 20) -> str:
21
  if df is None or df.empty:
22
  return "_<empty table>_"
 
25
  df2 = df2.head(max_rows)
26
  return df2.to_markdown(index=False)
27
 
28
def _detect_numeric_columns(df: pd.DataFrame) -> List[str]:
    """List the columns whose values are mostly parseable as numbers.

    For each column, up to 100 non-null values are probed with
    ``pd.to_numeric``. The column is reported when strictly more than half of
    the probed values parse, so numbers stored as strings are picked up too.
    Columns with no non-null values are skipped.
    """
    detected: List[str] = []
    for name in df.columns:
        probe = df[name].dropna().head(100)
        if len(probe) == 0:
            continue
        parsed = pd.to_numeric(probe, errors='coerce')
        parsed_ok = parsed.notna().sum()
        # Strict majority: exactly 50% parseable is not enough.
        if parsed_ok > len(probe) * 0.5:
            detected.append(name)
    return detected
 
 
 
 
 
40
 
41
def _detect_categorical_columns(df: pd.DataFrame, max_unique_ratio: float = 0.3) -> List[str]:
    """Return the text-like columns that look categorical.

    A column qualifies when it holds text or categorical data (``object``,
    pandas string, or categorical dtype) and its ratio of distinct non-null
    values to total rows lies in ``(0, max_unique_ratio]``.

    Args:
        df: Table to inspect.
        max_unique_ratio: Upper bound (inclusive) on ``nunique / row_count``.

    Returns:
        Column names in their original order; an empty list for a frame with
        no rows.
    """
    row_count = len(df)
    if row_count == 0:
        # No rows means no ratio to compute (and would divide by zero below).
        return []

    def _is_text_like(dtype) -> bool:
        # ``object`` is the classic string container; dedicated string and
        # categorical dtypes carry the same kind of label data.
        return dtype == 'object' or isinstance(dtype, (pd.StringDtype, pd.CategoricalDtype))

    return [
        col
        for col in df.columns
        if _is_text_like(df[col].dtype)
        and 0 < df[col].nunique() / row_count <= max_unique_ratio
    ]
51
+
52
def _find_best_grouping_column(df: pd.DataFrame, preferred_patterns: List[str] = None) -> Optional[str]:
    """Pick the categorical column that scores highest as a grouping key.

    Candidates come from ``_detect_categorical_columns``. Each one is scored
    on three things:

    * name match: only the first matching pattern counts, and earlier
      patterns are worth more (``(len(patterns) - index) * 10``);
    * group count: +5 when the number of distinct values is between 2 and
      ``min(50, rows // 5)``;
    * completeness: up to +3, scaled by the share of non-missing values.

    Returns the top-scoring column name (the earliest column wins a tie), or
    ``None`` when there are no categorical candidates.
    """
    patterns = preferred_patterns
    if patterns is None:
        patterns = [
            r'facility|hospital|clinic|center|centre|institution|provider|site|location',
            r'specialty|service|department|unit|division|program|type|category',
            r'zone|region|area|district|network|system|catchment',
            r'practitioner|physician|doctor|nurse|staff',
            r'procedure|treatment|intervention|therapy|service_type',
            r'name|id|identifier'
        ]
    pattern_count = len(patterns)

    winner = None
    winning_points = None
    for candidate in _detect_categorical_columns(df):
        lowered = candidate.lower()

        # Index of the first pattern that matches the column name, if any.
        first_hit = next(
            (idx for idx, pat in enumerate(patterns) if re.search(pat, lowered)),
            None,
        )
        points = 0 if first_hit is None else (pattern_count - first_hit) * 10

        distinct = df[candidate].nunique()
        row_total = len(df)
        if 2 <= distinct <= min(50, row_total // 5):
            points += 5

        share_missing = df[candidate].isna().sum() / len(df)
        points += (1 - share_missing) * 3

        # Strict comparison keeps the earliest column on equal scores.
        if winning_points is None or points > winning_points:
            winner, winning_points = candidate, points

    return winner
97
+
98
def _find_best_metric_column(df: pd.DataFrame, grouping_col: Optional[str] = None) -> Optional[str]:
    """Pick the numeric column that scores highest as an analysis metric.

    Candidates come from ``_detect_numeric_columns``, minus ``grouping_col``
    so that a numeric grouping key is never aggregated against itself.
    Each candidate is scored on:

    * name match: +10 if the lower-cased name matches any metric pattern;
    * spread: +5 if the coefficient of variation of the cleaned values
      exceeds 0.1 (skipped when the mean is 0 or the column is all-missing);
    * completeness: up to +3, scaled by the share of non-missing raw values.

    Args:
        df: Table to inspect.
        grouping_col: Column the caller groups by; excluded from candidates
            when given.

    Returns:
        The top-scoring column name (the earliest column wins a tie), or
        ``None`` when no candidate remains.
    """
    candidates = [col for col in _detect_numeric_columns(df) if col != grouping_col]
    if not candidates:
        return None

    # Healthcare-relevant metric patterns
    healthcare_metric_patterns = [
        r'wait|delay|time|duration|length',
        r'cost|price|expense|fee|charge|budget',
        r'volume|count|number|quantity|throughput|capacity',
        r'rate|ratio|percent|percentage|score|index',
        r'outcome|result|mortality|morbidity|readmission',
        r'satisfaction|quality|performance|efficiency',
        r'utilization|occupancy|availability',
        r'median|mean|average|percentile|p\d+|90th|95th'
    ]

    scored_cols = []
    for col in candidates:
        score = 0
        col_lower = col.lower()

        if any(re.search(pattern, col_lower) for pattern in healthcare_metric_patterns):
            score += 10

        # The variance bonus is best-effort: a column that cannot be
        # summarised simply goes without it. Only ordinary errors are
        # suppressed, so interrupts and exits still propagate.
        try:
            clean_series = _clean_numeric_series(df[col])
            if not clean_series.isna().all():
                std_dev = clean_series.std()
                mean_val = clean_series.mean()
                if mean_val != 0 and std_dev / abs(mean_val) > 0.1:
                    score += 5
        except Exception:
            pass

        missing_ratio = df[col].isna().sum() / len(df)
        score += (1 - missing_ratio) * 3

        scored_cols.append((col, score))

    # max() returns the first of several equal maxima, i.e. the earliest column.
    return max(scored_cols, key=lambda pair: pair[1])[0]
151
 
152
def compute_generic_rankings(reg: DataRegistry, mapping: MappingResult,
                             entity_concept: str, metric_concept: str,
                             ranking_name: str) -> Optional[pd.DataFrame]:
    """Rank the values of an entity column by their average metric.

    The entity column is resolved through ``mapping``. The metric column is
    the mapped ``metric_concept`` when it resolves to the very same table
    object; otherwise ``_find_best_metric_column`` chooses one from the
    entity's table. Rows with a missing or blank entity label, or a metric
    that is not numeric after cleaning, are dropped.

    The result has one row per entity with ``avg_<metric_concept>``,
    ``record_count``, ``std_<metric_concept>`` and ``rank`` (1 = highest
    average), with numeric columns rounded to one decimal. ``None`` is
    returned when the entity or a metric cannot be resolved, or no rows
    survive cleaning.

    ``ranking_name`` is accepted but not used by this function.
    """
    table, entity_col = _get(reg, mapping, entity_concept)
    if table is None or entity_col is None:
        return None

    # Use the mapped metric only if it lives in the same table as the entity.
    metric_table, mapped_col = _get(reg, mapping, metric_concept)
    if metric_table is not None and mapped_col is not None and metric_table is table:
        metric_col = mapped_col
    else:
        metric_col = _find_best_metric_column(table, entity_col)
    if metric_col is None:
        return None

    labels = table[entity_col]
    has_label = labels.notna() & (labels != '') & (labels.astype(str).str.strip() != '')
    rows = table[has_label].copy()
    rows[metric_col] = _clean_numeric_series(rows[metric_col])
    rows = rows[rows[metric_col].notna()]
    if rows.empty:
        return None

    avg_name = f'avg_{metric_concept}'
    stats = rows.groupby(entity_col, dropna=True)[metric_col].agg(['mean', 'count', 'std'])
    ranked = (
        stats.reset_index()
        .rename(columns={
            'mean': avg_name,
            'count': 'record_count',
            'std': f'std_{metric_concept}',
        })
        # Highest average first; for waits, costs and the like that is the worst.
        .sort_values(avg_name, ascending=False)
    )
    ranked['rank'] = np.arange(1, len(ranked) + 1)

    to_round = ranked.select_dtypes(include=[np.number]).columns
    ranked[to_round] = ranked[to_round].round(1)
    return ranked
199
 
200
def compute_comparative_analysis(reg: DataRegistry, mapping: MappingResult,
                                 grouping_concept: str, metric_concept: str) -> Optional[pd.DataFrame]:
    """Compare the average of a metric across the groups of one column.

    The grouping column is resolved through ``mapping``. The metric column is
    the mapped ``metric_concept`` when it resolves to the very same table
    object; otherwise ``_find_best_metric_column`` chooses one from the
    grouping table. Rows with a missing, empty or whitespace-only group
    label, or a metric that is not numeric after cleaning, are dropped. This
    is the same label filter ``compute_generic_rankings`` applies, so both
    reports agree on which rows count.

    Returns:
        One row per group with ``avg_<metric_concept>``, ``record_count``,
        ``std_<metric_concept>`` and ``vs_overall_avg`` (group average minus
        the average over all retained rows), sorted by average descending
        and rounded to one decimal. ``None`` when the grouping column or a
        metric cannot be resolved, or no rows survive cleaning.
    """
    df, group_col = _get(reg, mapping, grouping_concept)
    if df is None or group_col is None:
        return None

    # Use the mapped metric only if it lives in the same table as the groups.
    df_metric, mapped_metric_col = _get(reg, mapping, metric_concept)
    if df_metric is not None and mapped_metric_col is not None and df_metric is df:
        metric_col = mapped_metric_col
    else:
        metric_col = _find_best_metric_column(df, group_col)
    if metric_col is None:
        return None

    labels = df[group_col]
    # Whitespace-only labels are placeholders, not real groups.
    has_label = labels.notna() & (labels != '') & (labels.astype(str).str.strip() != '')
    df_clean = df[has_label].copy()
    df_clean[metric_col] = _clean_numeric_series(df_clean[metric_col])
    df_clean = df_clean[df_clean[metric_col].notna()]
    if df_clean.empty:
        return None

    avg_name = f'avg_{metric_concept}'
    grouped = df_clean.groupby(group_col, dropna=True)[metric_col].agg(['mean', 'count', 'std']).reset_index()
    grouped = grouped.rename(columns={
        'mean': avg_name,
        'count': 'record_count',
        'std': f'std_{metric_concept}'
    })

    # Baseline is the record-level mean, not the mean of the group means.
    overall_avg = df_clean[metric_col].mean()
    grouped['vs_overall_avg'] = (grouped[avg_name] - overall_avg).round(1)

    grouped = grouped.sort_values(avg_name, ascending=False)

    numeric_cols = grouped.select_dtypes(include=[np.number]).columns
    grouped[numeric_cols] = grouped[numeric_cols].round(1)

    return grouped
247
 
248
def compute_capacity_metrics(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
    """Summarise every capacity-related concept that the mapping resolves.

    For each concept in the list below that resolves to a column with at
    least one numeric value after cleaning, three rows are produced:
    ``<concept>_total``, ``<concept>_average`` and ``<concept>_records``
    (count of non-missing values).

    Returns:
        A frame with ``metric`` and ``value`` columns, or ``None`` when no
        concept yields data.
    """
    capacity_concepts = [
        'capacity', 'beds', 'staffed_beds',
        # Concept key looked up by the previous version of this module; kept
        # so mappings that resolve it still produce capacity figures.
        'capacity_beds',
        'occupied_beds', 'available_beds',
        'volume', 'throughput', 'utilization', 'occupancy',
        'appointments', 'procedures', 'admissions', 'discharges',
        'staffing', 'fte', 'personnel'
    ]

    results = []
    for concept in capacity_concepts:
        df, col = _get(reg, mapping, concept)
        if df is None or col is None:
            continue
        clean_series = _clean_numeric_series(df[col])
        if clean_series.isna().all():
            continue
        results.extend([
            {'metric': f'{concept}_total', 'value': float(np.nansum(clean_series))},
            {'metric': f'{concept}_average', 'value': float(np.nanmean(clean_series))},
            {'metric': f'{concept}_records', 'value': int((~clean_series.isna()).sum())},
        ])

    if results:
        return pd.DataFrame(results)
    return None
279
 
280
def compute_cost_metrics(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
    """Summarise every cost-related concept that the mapping resolves.

    For each concept in the list below that resolves to a column with at
    least one numeric value after cleaning, two rows are produced:
    ``<concept>_total`` and ``<concept>_average``.

    Returns:
        A frame with ``component`` and ``value`` columns, or ``None`` when no
        concept yields data.
    """
    cost_concepts = [
        'cost', 'price', 'expense', 'fee', 'charge', 'budget', 'funding',
        'fixed_cost', 'variable_cost',
        # Concept keys looked up by the previous version of this module; kept
        # so mappings that resolve them still produce cost figures.
        'cost_fixed', 'cost_variable',
        'operational_cost', 'capital_cost',
        'reimbursement', 'revenue', 'billing', 'payment'
    ]

    results = []
    for concept in cost_concepts:
        df, col = _get(reg, mapping, concept)
        if df is None or col is None:
            continue
        clean_series = _clean_numeric_series(df[col])
        if clean_series.isna().all():
            continue
        results.extend([
            {'component': f'{concept}_total', 'value': float(np.nansum(clean_series))},
            {'component': f'{concept}_average', 'value': float(np.nanmean(clean_series))},
        ])

    if results:
        return pd.DataFrame(results)
    return None
306
 
307
def auto_discover_healthcare_analysis_opportunities(reg: DataRegistry) -> Dict[str, List[str]]:
    """Suggest analyses based on the column names found in each table.

    Every non-empty table in ``reg._tables`` is inspected. Categorical
    columns are bucketed by name into provider, service and regional
    groupings, and numeric columns into outcome and efficiency metrics.
    Up to two groupings of each kind are paired with the first two metrics
    (efficiency first, then outcome) to form human-readable suggestions; up
    to three outcome and three efficiency column names per table are also
    listed on their own.

    Returns a dict of suggestion lists keyed by analysis family; lists stay
    empty when nothing matches.
    """
    def _named_like(columns, pattern):
        return [name for name in columns if re.search(pattern, name.lower())]

    opportunities = {
        'provider_rankings': [],
        'service_comparisons': [],
        'regional_analysis': [],
        'outcome_metrics': [],
        'efficiency_metrics': []
    }

    for _table_name, table in reg._tables.items():
        if table.empty:
            continue

        categorical = _detect_categorical_columns(table)
        numeric = _detect_numeric_columns(table)

        providers = _named_like(categorical, r'facility|hospital|clinic|provider')
        services = _named_like(categorical, r'specialty|service|department|procedure')
        regions = _named_like(categorical, r'zone|region|area|district')

        outcomes = _named_like(numeric, r'outcome|mortality|readmission|infection|complication')
        efficiencies = _named_like(numeric, r'wait|time|throughput|utilization|length_of_stay')

        # The same two headline metrics are paired with every grouping kind.
        headline_metrics = (efficiencies + outcomes)[:2]

        opportunities['provider_rankings'].extend(
            f"{provider} by {metric}"
            for provider in providers[:2]
            for metric in headline_metrics
        )
        opportunities['service_comparisons'].extend(
            f"{metric} across {service}"
            for service in services[:2]
            for metric in headline_metrics
        )
        opportunities['regional_analysis'].extend(
            f"{metric} by {region}"
            for region in regions[:2]
            for metric in headline_metrics
        )

        opportunities['outcome_metrics'].extend(outcomes[:3])
        opportunities['efficiency_metrics'].extend(efficiencies[:3])

    return opportunities
350
 
351
def _first_nonempty_result(compute, reg, mapping, group_concepts, metric_concepts, *extra):
    """Return ``(group, metric, frame)`` for the first pair yielding a non-empty frame, else ``None``."""
    for group_concept in group_concepts:
        for metric_concept in metric_concepts:
            frame = compute(reg, mapping, group_concept, metric_concept, *extra)
            if frame is not None and not frame.empty:
                return group_concept, metric_concept, frame
    return None


def build_data_findings_markdown(reg: DataRegistry, mapping: MappingResult, topn: int = 5) -> Tuple[str, List[str]]:
    """Build the markdown findings report from whatever the mapping resolves.

    For each ranking pattern and each comparison pattern, concept pairs are
    tried in order (groupings outer, metrics inner) and the first pair that
    yields a non-empty table becomes a report section. Capacity and cost
    summaries are appended when available.

    Args:
        reg: Registry holding the loaded tables.
        mapping: Concept-to-column mapping used to resolve columns.
        topn: Number of rows shown in each ranking section.

    Returns:
        ``(markdown, missing)`` where ``missing`` names every analysis that
        produced no section.
    """
    missing: List[str] = []
    sections = []

    # Healthcare-specific analysis patterns
    analysis_patterns = [
        ('provider rankings', ['facility', 'provider', 'hospital', 'clinic'], ['wait_time', 'wait_median', 'wait_days', 'wait_p90', 'cost', 'outcome']),
        ('service analysis', ['specialty', 'service', 'department', 'procedure', 'treatment'], ['wait_time', 'wait_median', 'wait_days', 'cost', 'outcome']),
        ('regional comparison', ['zone', 'region', 'area', 'district', 'network'], ['wait_time', 'wait_median', 'cost', 'outcome']),
        ('quality metrics', ['facility', 'service'], ['mortality', 'readmission', 'infection', 'complication', 'satisfaction']),
    ]

    for analysis_name, entity_concepts, metric_concepts in analysis_patterns:
        hit = _first_nonempty_result(
            compute_generic_rankings, reg, mapping, entity_concepts, metric_concepts, analysis_name
        )
        if hit is None:
            missing.append(analysis_name)
            continue
        entity_concept, metric_concept, result = hit
        sections.append(f"**Top {entity_concept.title()} by {metric_concept.replace('_', ' ').title()}**\n\n{_fmt_tbl(result.head(topn))}")

    # Healthcare-specific comparative analyses
    comparison_patterns = [
        ('regional_performance', ['zone', 'region', 'area', 'district'], ['wait_time', 'wait_median', 'cost', 'outcome']),
        ('service_performance', ['specialty', 'service', 'department'], ['wait_time', 'wait_median', 'cost', 'outcome']),
        ('provider_comparison', ['facility', 'hospital', 'clinic'], ['efficiency', 'utilization', 'throughput']),
    ]

    for analysis_name, group_concepts, metric_concepts in comparison_patterns:
        hit = _first_nonempty_result(
            compute_comparative_analysis, reg, mapping, group_concepts, metric_concepts
        )
        if hit is None:
            missing.append(analysis_name)
            continue
        group_concept, _metric_concept, result = hit
        sections.append(f"**{group_concept.title()} Performance Comparison**\n\n{_fmt_tbl(result)}")

    # Healthcare capacity analysis
    capacity = compute_capacity_metrics(reg, mapping)
    if capacity is not None and not capacity.empty:
        sections.append(f"**Healthcare Capacity Analysis**\n\n{_fmt_tbl(capacity)}")
    else:
        missing.append("capacity_analysis")

    # Healthcare cost analysis
    costs = compute_cost_metrics(reg, mapping)
    if costs is not None and not costs.empty:
        sections.append(f"**Healthcare Cost Analysis**\n\n{_fmt_tbl(costs)}")
    else:
        missing.append("cost_analysis")

    # Build final healthcare report
    if sections:
        md = (
            "### Healthcare Data Analysis Results\n\n" +
            "\n\n".join(sections) +
            "\n\n**Clinical Data Quality Notes**\n"
            "- Analysis performed on available healthcare data columns\n"
            "- Missing values and empty entries excluded from calculations\n"
            "- Numeric values rounded to 1 decimal place for clinical relevance\n"
            "- Rankings prioritize areas that may require clinical attention or resource allocation\n"
            "- Record counts indicate data volume and statistical reliability\n"
        )
    else:
        md = "### Healthcare Data Analysis Results\n\nNo analyzable healthcare patterns found in the provided data. Consider uploading data with healthcare facility, service, or outcome metrics."

    return md, missing