Spaces:
Build error
Build error
File size: 18,445 Bytes
ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 ed780b7 1c8ef92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 |
from __future__ import annotations
from typing import Dict, Any, Tuple, Optional, List, Union
import pandas as pd
import numpy as np
from data_registry import DataRegistry
from schema_mapper import MappingResult
import re
def _get(reg: DataRegistry, mapping: MappingResult, concept: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
if concept not in mapping.resolved:
return None, None
tname, col = mapping.resolved[concept]
return reg.get(tname), col
def _clean_numeric_series(series: pd.Series) -> pd.Series:
"""Clean numeric data, handling various missing value representations."""
cleaned = series.replace(['', '—', '-', 'null', 'NULL', 'N/A', 'n/a', ' ', 'nan'], np.nan)
return pd.to_numeric(cleaned, errors='coerce')
def _fmt_tbl(df: pd.DataFrame, max_rows: int = 20) -> str:
if df is None or df.empty:
return "_<empty table>_"
df2 = df.copy()
if len(df2) > max_rows:
df2 = df2.head(max_rows)
return df2.to_markdown(index=False)
def _detect_numeric_columns(df: pd.DataFrame) -> List[str]:
"""Detect columns that contain numeric data (even if stored as strings)."""
numeric_cols = []
for col in df.columns:
# Try to convert a sample to numeric
sample = df[col].dropna().head(100)
if len(sample) > 0:
numeric_sample = pd.to_numeric(sample, errors='coerce')
# If more than 50% can be converted to numeric, consider it numeric
if numeric_sample.notna().sum() > len(sample) * 0.5:
numeric_cols.append(col)
return numeric_cols
def _detect_categorical_columns(df: pd.DataFrame, max_unique_ratio: float = 0.3) -> List[str]:
"""Detect categorical columns with reasonable number of unique values."""
categorical_cols = []
for col in df.columns:
if df[col].dtype == 'object': # String-like columns
unique_ratio = df[col].nunique() / len(df)
# If unique ratio is low, likely categorical
if 0 < unique_ratio <= max_unique_ratio:
categorical_cols.append(col)
return categorical_cols
def _find_best_grouping_column(df: pd.DataFrame, preferred_patterns: List[str] = None) -> Optional[str]:
"""Find the best column to group by based on healthcare patterns and characteristics."""
if preferred_patterns is None:
preferred_patterns = [
r'facility|hospital|clinic|center|centre|institution|provider|site|location',
r'specialty|service|department|unit|division|program|type|category',
r'zone|region|area|district|network|system|catchment',
r'practitioner|physician|doctor|nurse|staff',
r'procedure|treatment|intervention|therapy|service_type',
r'name|id|identifier'
]
categorical_cols = _detect_categorical_columns(df)
# Score columns based on pattern matching and characteristics
scored_cols = []
for col in categorical_cols:
score = 0
col_lower = col.lower()
# Pattern matching score
for i, pattern in enumerate(preferred_patterns):
if re.search(pattern, col_lower):
score += (len(preferred_patterns) - i) * 10 # Higher score for earlier patterns
break
# Characteristics score
unique_count = df[col].nunique()
total_count = len(df)
# Prefer columns with reasonable number of groups (not too few, not too many)
if 2 <= unique_count <= min(50, total_count // 5):
score += 5
# Prefer columns with less missing data
missing_ratio = df[col].isna().sum() / len(df)
score += (1 - missing_ratio) * 3
scored_cols.append((col, score))
if scored_cols:
scored_cols.sort(key=lambda x: x[1], reverse=True)
return scored_cols[0][0]
return None
def _find_best_metric_column(df: pd.DataFrame, grouping_col: str = None) -> Optional[str]:
"""Find the best numeric column to analyze as a healthcare metric."""
numeric_cols = _detect_numeric_columns(df)
if not numeric_cols:
return None
# Healthcare-relevant metric patterns
healthcare_metric_patterns = [
r'wait|delay|time|duration|length',
r'cost|price|expense|fee|charge|budget',
r'volume|count|number|quantity|throughput|capacity',
r'rate|ratio|percent|percentage|score|index',
r'outcome|result|mortality|morbidity|readmission',
r'satisfaction|quality|performance|efficiency',
r'utilization|occupancy|availability',
r'median|mean|average|percentile|p\d+|90th|95th'
]
# Score numeric columns
scored_cols = []
for col in numeric_cols:
score = 0
col_lower = col.lower()
# Prefer columns with healthcare-relevant names
for pattern in healthcare_metric_patterns:
if re.search(pattern, col_lower):
score += 10
break
# Prefer columns with reasonable variance
try:
clean_series = _clean_numeric_series(df[col])
if not clean_series.isna().all():
std_dev = clean_series.std()
mean_val = clean_series.mean()
if mean_val != 0 and std_dev / abs(mean_val) > 0.1: # Coefficient of variation > 0.1
score += 5
except:
pass
# Prefer columns with less missing data
missing_ratio = df[col].isna().sum() / len(df)
score += (1 - missing_ratio) * 3
scored_cols.append((col, score))
if scored_cols:
scored_cols.sort(key=lambda x: x[1], reverse=True)
return scored_cols[0][0]
return None
def compute_generic_rankings(reg: DataRegistry, mapping: MappingResult,
entity_concept: str, metric_concept: str,
ranking_name: str) -> Optional[pd.DataFrame]:
"""Generic function to compute rankings for any healthcare entity by any metric."""
df, entity_col = _get(reg, mapping, entity_concept)
if df is None or entity_col is None:
return None
# Find metric column
metric_col = None
df_metric, mapped_metric_col = _get(reg, mapping, metric_concept)
if df_metric is not None and mapped_metric_col is not None and df_metric is df:
metric_col = mapped_metric_col
else:
# Fallback: find best numeric column
metric_col = _find_best_metric_column(df, entity_col)
if metric_col is None:
return None
# Clean the data
df_clean = df[df[entity_col].notna() & (df[entity_col] != '') & (df[entity_col].astype(str).str.strip() != '')].copy()
df_clean[metric_col] = _clean_numeric_series(df_clean[metric_col])
df_clean = df_clean[df_clean[metric_col].notna()]
if df_clean.empty:
return None
# Group and calculate statistics
grouped = df_clean.groupby(entity_col, dropna=True)[metric_col].agg(['mean', 'count', 'std']).reset_index()
grouped = grouped.rename(columns={
'mean': f'avg_{metric_concept}',
'count': 'record_count',
'std': f'std_{metric_concept}'
})
# Sort by average metric (adjust based on whether higher or lower is better)
# For healthcare metrics like wait times, errors, costs - higher is typically worse
grouped = grouped.sort_values(f'avg_{metric_concept}', ascending=False)
grouped['rank'] = np.arange(1, len(grouped) + 1)
# Round numeric columns
numeric_cols = grouped.select_dtypes(include=[np.number]).columns
grouped[numeric_cols] = grouped[numeric_cols].round(1)
return grouped
def compute_comparative_analysis(reg: DataRegistry, mapping: MappingResult,
grouping_concept: str, metric_concept: str) -> Optional[pd.DataFrame]:
"""Generic function to compare healthcare metrics across different groups."""
df, group_col = _get(reg, mapping, grouping_concept)
if df is None or group_col is None:
return None
# Find metric column
metric_col = None
df_metric, mapped_metric_col = _get(reg, mapping, metric_concept)
if df_metric is not None and mapped_metric_col is not None and df_metric is df:
metric_col = mapped_metric_col
else:
metric_col = _find_best_metric_column(df, group_col)
if metric_col is None:
return None
# Clean data
df_clean = df[df[group_col].notna() & (df[group_col] != '')].copy()
df_clean[metric_col] = _clean_numeric_series(df_clean[metric_col])
df_clean = df_clean[df_clean[metric_col].notna()]
if df_clean.empty:
return None
# Group and analyze
grouped = df_clean.groupby(group_col, dropna=True)[metric_col].agg(['mean', 'count', 'std']).reset_index()
grouped = grouped.rename(columns={
'mean': f'avg_{metric_concept}',
'count': 'record_count',
'std': f'std_{metric_concept}'
})
# Calculate overall average for comparison
overall_avg = df_clean[metric_col].mean()
grouped['vs_overall_avg'] = (grouped[f'avg_{metric_concept}'] - overall_avg).round(1)
# Sort by average metric
grouped = grouped.sort_values(f'avg_{metric_concept}', ascending=False)
# Round numeric columns
numeric_cols = grouped.select_dtypes(include=[np.number]).columns
grouped[numeric_cols] = grouped[numeric_cols].round(1)
return grouped
def compute_capacity_metrics(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
"""Compute healthcare capacity-related metrics if available."""
capacity_concepts = [
'capacity', 'beds', 'staffed_beds', 'occupied_beds', 'available_beds',
'volume', 'throughput', 'utilization', 'occupancy',
'appointments', 'procedures', 'admissions', 'discharges',
'staffing', 'fte', 'personnel'
]
results = []
for concept in capacity_concepts:
df, col = _get(reg, mapping, concept)
if df is not None and col is not None:
clean_series = _clean_numeric_series(df[col])
if not clean_series.isna().all():
results.append({
'metric': f'{concept}_total',
'value': float(np.nansum(clean_series))
})
results.append({
'metric': f'{concept}_average',
'value': float(np.nanmean(clean_series))
})
results.append({
'metric': f'{concept}_records',
'value': int((~clean_series.isna()).sum())
})
if results:
return pd.DataFrame(results)
return None
def compute_cost_metrics(reg: DataRegistry, mapping: MappingResult) -> Optional[pd.DataFrame]:
"""Compute healthcare cost-related metrics if available."""
cost_concepts = [
'cost', 'price', 'expense', 'fee', 'charge', 'budget', 'funding',
'fixed_cost', 'variable_cost', 'operational_cost', 'capital_cost',
'reimbursement', 'revenue', 'billing', 'payment'
]
results = []
for concept in cost_concepts:
df, col = _get(reg, mapping, concept)
if df is not None and col is not None:
clean_series = _clean_numeric_series(df[col])
if not clean_series.isna().all():
results.append({
'component': f'{concept}_total',
'value': float(np.nansum(clean_series))
})
results.append({
'component': f'{concept}_average',
'value': float(np.nanmean(clean_series))
})
if results:
return pd.DataFrame(results)
return None
def auto_discover_healthcare_analysis_opportunities(reg: DataRegistry) -> Dict[str, List[str]]:
"""Automatically discover what healthcare analyses are possible with the available data."""
opportunities = {
'provider_rankings': [],
'service_comparisons': [],
'regional_analysis': [],
'outcome_metrics': [],
'efficiency_metrics': []
}
for table_name, df in reg._tables.items():
if df.empty:
continue
# Find potential healthcare grouping columns
categorical_cols = _detect_categorical_columns(df)
numeric_cols = _detect_numeric_columns(df)
# Healthcare-specific categorization
provider_cols = [col for col in categorical_cols if re.search(r'facility|hospital|clinic|provider', col.lower())]
service_cols = [col for col in categorical_cols if re.search(r'specialty|service|department|procedure', col.lower())]
regional_cols = [col for col in categorical_cols if re.search(r'zone|region|area|district', col.lower())]
outcome_cols = [col for col in numeric_cols if re.search(r'outcome|mortality|readmission|infection|complication', col.lower())]
efficiency_cols = [col for col in numeric_cols if re.search(r'wait|time|throughput|utilization|length_of_stay', col.lower())]
# Suggest healthcare-specific analyses
for provider_col in provider_cols[:2]:
for metric_col in (efficiency_cols + outcome_cols)[:2]:
opportunities['provider_rankings'].append(f"{provider_col} by {metric_col}")
for service_col in service_cols[:2]:
for metric_col in (efficiency_cols + outcome_cols)[:2]:
opportunities['service_comparisons'].append(f"{metric_col} across {service_col}")
for regional_col in regional_cols[:2]:
for metric_col in (efficiency_cols + outcome_cols)[:2]:
opportunities['regional_analysis'].append(f"{metric_col} by {regional_col}")
opportunities['outcome_metrics'].extend(outcome_cols[:3])
opportunities['efficiency_metrics'].extend(efficiency_cols[:3])
return opportunities
def build_data_findings_markdown(reg: DataRegistry, mapping: MappingResult, topn: int = 5):
"""Build generic healthcare data analysis report based on available data and mappings."""
missing: List[str] = []
sections = []
# Auto-discover healthcare analysis opportunities
opportunities = auto_discover_healthcare_analysis_opportunities(reg)
# Healthcare-specific analysis patterns
analysis_patterns = [
('provider rankings', ['facility', 'provider', 'hospital', 'clinic'], ['wait_time', 'wait_median', 'wait_days', 'wait_p90', 'cost', 'outcome']),
('service analysis', ['specialty', 'service', 'department', 'procedure', 'treatment'], ['wait_time', 'wait_median', 'wait_days', 'cost', 'outcome']),
('regional comparison', ['zone', 'region', 'area', 'district', 'network'], ['wait_time', 'wait_median', 'cost', 'outcome']),
('quality metrics', ['facility', 'service'], ['mortality', 'readmission', 'infection', 'complication', 'satisfaction']),
]
for analysis_name, entity_concepts, metric_concepts in analysis_patterns:
found_analysis = False
for entity_concept in entity_concepts:
for metric_concept in metric_concepts:
result = compute_generic_rankings(reg, mapping, entity_concept, metric_concept, analysis_name)
if result is not None and not result.empty:
sections.append(f"**Top {entity_concept.title()} by {metric_concept.replace('_', ' ').title()}**\n\n{_fmt_tbl(result.head(topn))}")
found_analysis = True
break
if found_analysis:
break
if not found_analysis:
missing.append(analysis_name)
# Healthcare-specific comparative analyses
comparison_patterns = [
('regional_performance', ['zone', 'region', 'area', 'district'], ['wait_time', 'wait_median', 'cost', 'outcome']),
('service_performance', ['specialty', 'service', 'department'], ['wait_time', 'wait_median', 'cost', 'outcome']),
('provider_comparison', ['facility', 'hospital', 'clinic'], ['efficiency', 'utilization', 'throughput']),
]
for analysis_name, group_concepts, metric_concepts in comparison_patterns:
found_analysis = False
for group_concept in group_concepts:
for metric_concept in metric_concepts:
result = compute_comparative_analysis(reg, mapping, group_concept, metric_concept)
if result is not None and not result.empty:
sections.append(f"**{group_concept.title()} Performance Comparison**\n\n{_fmt_tbl(result)}")
found_analysis = True
break
if found_analysis:
break
if not found_analysis:
missing.append(analysis_name)
# Healthcare capacity analysis
capacity = compute_capacity_metrics(reg, mapping)
if capacity is not None and not capacity.empty:
sections.append(f"**Healthcare Capacity Analysis**\n\n{_fmt_tbl(capacity)}")
else:
missing.append("capacity_analysis")
# Healthcare cost analysis
costs = compute_cost_metrics(reg, mapping)
if costs is not None and not costs.empty:
sections.append(f"**Healthcare Cost Analysis**\n\n{_fmt_tbl(costs)}")
else:
missing.append("cost_analysis")
# Build final healthcare report
if sections:
md = (
"### Healthcare Data Analysis Results\n\n" +
"\n\n".join(sections) +
"\n\n**Clinical Data Quality Notes**\n"
"- Analysis performed on available healthcare data columns\n"
"- Missing values and empty entries excluded from calculations\n"
"- Numeric values rounded to 1 decimal place for clinical relevance\n"
"- Rankings prioritize areas that may require clinical attention or resource allocation\n"
"- Record counts indicate data volume and statistical reliability\n"
)
else:
md = "### Healthcare Data Analysis Results\n\nNo analyzable healthcare patterns found in the provided data. Consider uploading data with healthcare facility, service, or outcome metrics."
return md, missing |