Rajan Sharma committed
Commit 1134cbf · verified · 1 Parent(s): 33bf7ab

Update healthcare_analysis.py

Files changed (1)
  1. healthcare_analysis.py +8 -927
healthcare_analysis.py CHANGED
@@ -1,932 +1,13 @@
 # healthcare_analysis.py
 import pandas as pd
-import numpy as np
-from typing import Dict, List, Any, Optional, Tuple
-import logging
-import re
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from data_registry import DataRegistry
 
 class HealthcareAnalyzer:
-    def __init__(self, data_registry):
-        self.data_registry = data_registry
-        self.analysis_results = {}
-        self.scenario_text = ""
-
-    def comprehensive_analysis(self, scenario_text: str) -> Dict[str, Any]:
-        """Perform comprehensive healthcare scenario analysis"""
-        logger.info("Starting comprehensive healthcare analysis")
-
-        self.scenario_text = scenario_text
-
-        # Extract all requirements and tasks
-        requirements = self._extract_all_requirements(scenario_text)
-        tasks = self._extract_detailed_tasks(scenario_text)
-
-        # Identify relevant datasets
-        relevant_data = self._identify_relevant_data(scenario_text)
-
-        # Perform all analyses based on tasks
-        results = {
-            "requirements": requirements,
-            "tasks_completed": [],
-            "data_sources": relevant_data
-        }
-
-        # Data Preparation Tasks
-        if "data_preparation" in tasks:
-            results["data_preparation"] = self.analyze_data_preparation(relevant_data, requirements)
-            results["tasks_completed"].append("data_preparation")
-
-        # Facility Distribution Analysis
-        if "facility_distribution" in tasks:
-            results["facility_distribution"] = self.analyze_facility_distribution(relevant_data, requirements)
-            results["tasks_completed"].append("facility_distribution")
-
-        # Capacity Analysis
-        if "capacity_analysis" in tasks:
-            results["capacity_analysis"] = self.analyze_capacity(relevant_data, requirements)
-            results["tasks_completed"].append("capacity_analysis")
-
-        # Long-Term Care Assessment (specific to scenario requirements)
-        if "long_term_care_assessment" in tasks:
-            results["long_term_care_assessment"] = self.analyze_long_term_care_capacity(results, requirements)
-            results["tasks_completed"].append("long_term_care_assessment")
-
-        # Resource Allocation Analysis
-        if "resource_allocation" in tasks:
-            results["resource_allocation"] = self.analyze_resource_allocation(relevant_data)
-            results["tasks_completed"].append("resource_allocation")
-
-        # Trends Analysis
-        if "trends" in tasks:
-            results["trends"] = self.analyze_trends(relevant_data)
-            results["tasks_completed"].append("trends")
-
-        # Generate recommendations
-        if "operational_recommendations" in tasks:
-            results["recommendations"] = self.generate_operational_recommendations(results, requirements)
-            results["tasks_completed"].append("operational_recommendations")
-
-        # Future Integration Opportunities
-        if "future_integration" in tasks:
-            results["future_integration"] = self.identify_integration_opportunities(results)
-            results["tasks_completed"].append("future_integration")
-
-        # Validate that all required tasks were completed
-        validation_result = self.validate_analysis_completeness(tasks, results["tasks_completed"])
-        results["validation"] = validation_result
-
-        logger.info("Comprehensive analysis completed")
-        return results
-
-    def _extract_all_requirements(self, scenario_text: str) -> Dict[str, Any]:
-        """Extract all specific requirements from scenario text"""
-        requirements = {
-            "geographic_scope": self._extract_geographic_scope(scenario_text),
-            "time_period": self._extract_time_period(scenario_text),
-            "facility_types": self._extract_facility_types(scenario_text),
-            "metrics_needed": self._extract_metrics(scenario_text),
-            "regions": self._extract_regions(scenario_text),
-            "data_files": self._extract_data_files(scenario_text),
-            "specific_questions": self._extract_specific_questions(scenario_text)
-        }
-        return requirements
-
-    def _extract_detailed_tasks(self, scenario_text: str) -> List[str]:
-        """Extract detailed tasks from scenario text"""
-        tasks = []
-        text_lower = scenario_text.lower()
-
-        # Data preparation tasks
-        if any(phrase in text_lower for phrase in ["load the data", "data preparation", "frequency table"]):
-            tasks.append("data_preparation")
-
-        # Facility distribution tasks
-        if any(phrase in text_lower for phrase in ["facility distribution", "cities with highest", "facility type"]):
-            tasks.append("facility_distribution")
-
-        # Capacity analysis tasks
-        if any(phrase in text_lower for phrase in ["bed capacity", "capacity analysis", "bed_change"]):
-            tasks.append("capacity_analysis")
-
-        # Long-term care assessment tasks
-        if any(phrase in text_lower for phrase in ["long-term care", "long term care", "nursing care"]):
-            tasks.append("long_term_care_assessment")
-
-        # Resource allocation tasks
-        if any(phrase in text_lower for phrase in ["resource allocation", "staffing", "equipment"]):
-            tasks.append("resource_allocation")
-
-        # Trends analysis tasks
-        if any(phrase in text_lower for phrase in ["trends", "change", "growth", "decline"]):
-            tasks.append("trends")
-
-        # Operational recommendations tasks
-        if any(phrase in text_lower for phrase in ["operational recommendations", "recommend actions", "mitigate shortages"]):
-            tasks.append("operational_recommendations")
-
-        # Future integration tasks
-        if any(phrase in text_lower for phrase in ["future integration", "augmented ai", "decision-making"]):
-            tasks.append("future_integration")
-
-        return tasks
-
-    def _extract_specific_questions(self, scenario_text: str) -> List[str]:
-        """Extract specific questions from scenario text"""
-        questions = []
-
-        # Look for question patterns
-        question_patterns = [
-            r'which zone shows the largest',
-            r'which zone has the largest',
-            r'list the five',
-            r'does this city have',
-            r'provide the numbers to justify',
-            r'propose at least',
-            r'mention at least'
-        ]
-
-        for pattern in question_patterns:
-            matches = re.findall(pattern, scenario_text, re.IGNORECASE)
-            questions.extend(matches)
-
-        return questions
-
-    def _extract_data_files(self, scenario_text: str) -> List[str]:
-        """Extract data file names from scenario text"""
-        files = []
-
-        # Look for file patterns
-        file_patterns = [
-            r'([a-zA-Z_]+\.csv)',
-            r'([a-zA-Z_]+\.xlsx)',
-            r'([a-zA-Z_]+\.json)'
-        ]
-
-        for pattern in file_patterns:
-            matches = re.findall(pattern, scenario_text)
-            files.extend(matches)
-
-        return list(set(files))  # Remove duplicates
-
-    def analyze_data_preparation(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced data preparation analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Facility type frequency table
-            type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
-            if type_col:
-                filtered_df[type_col] = filtered_df[type_col].astype(str)
-                type_freq = filtered_df[type_col].value_counts().to_dict()
-                results["facility_type_frequency"] = type_freq
-
-            # Top cities analysis
-            city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
-            if city_col:
-                filtered_df[city_col] = filtered_df[city_col].astype(str)
-                city_counts = filtered_df[city_col].value_counts().head(5)
-                top_cities = city_counts.index.tolist()
-
-                # Breakdown by facility type for each top city
-                city_breakdown = {}
-                for city in top_cities:
-                    city_data = filtered_df[filtered_df[city_col] == city]
-                    if not city_data.empty and type_col in city_data.columns:
-                        city_breakdown[city] = city_data[type_col].value_counts().to_dict()
-
-                results["top_cities"] = top_cities
-                results["city_facility_breakdown"] = city_breakdown
-
-            # Total facilities count
-            results["total_facilities"] = len(filtered_df)
-
-        return results
-
-    def analyze_long_term_care_capacity(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Analyze long-term care capacity based on scenario requirements"""
-        results = {}
-
-        # Get the zone with the largest percentage decrease from capacity analysis
-        if "capacity_analysis" in analysis_results:
-            capacity_data = analysis_results["capacity_analysis"]
-
-            # Find the zone with largest percentage decrease
-            max_pct_decrease = capacity_data.get("max_percentage_decrease", {})
-
-            # Extract zone name (try multiple possible keys)
-            zone_name = None
-            for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
-                if key in max_pct_decrease:
-                    zone_name = max_pct_decrease[key]
-                    break
-
-            if zone_name:
-                results["zone_with_largest_decrease"] = zone_name
-
-                # Get facility distribution data
-                if "facility_distribution" in analysis_results:
-                    facility_data = analysis_results["facility_distribution"]
-
-                    # Find the major city in this zone
-                    major_city = self._find_major_city_in_zone(zone_name, facility_data, requirements)
-
-                    if major_city:
-                        results["major_city"] = major_city
-
-                        # Analyze long-term care capacity in this city
-                        city_breakdown = facility_data.get("city_facility_breakdown", {})
-
-                        if major_city in city_breakdown:
-                            facilities_in_city = city_breakdown[major_city]
-
-                            # Count different facility types
-                            hospitals = facilities_in_city.get("Hospitals", 0)
-                            nursing_care = facilities_in_city.get("Nursing and residential care facilities", 0)
-                            ambulatory = facilities_in_city.get("Ambulatory health care services", 0)
-
-                            results["facility_counts"] = {
-                                "hospitals": hospitals,
-                                "nursing_residential_care": nursing_care,
-                                "ambulatory": ambulatory
-                            }
-
-                            # Calculate ratio and assess sufficiency
-                            if hospitals > 0:
-                                ratio = nursing_care / hospitals
-                                results["nursing_to_hospital_ratio"] = ratio
-
-                                # Assess capacity
-                                if ratio >= 1.5:
-                                    results["capacity_assessment"] = "sufficient"
-                                else:
-                                    results["capacity_assessment"] = "insufficient"
-                            else:
-                                results["capacity_assessment"] = "insufficient (no hospitals)"
-
-        return results
-
-    def _find_major_city_in_zone(self, zone_name: str, facility_data: Dict[str, Any], requirements: Dict[str, Any]) -> Optional[str]:
-        """Find the major city in a given zone"""
-        # This is a simplified approach - in a real implementation, you would need
-        # zone-to-city mapping data or more sophisticated geospatial analysis
-
-        # For now, we'll use the city with the most facilities as the major city
-        top_cities = facility_data.get("top_cities", [])
-
-        if top_cities:
-            # In a real implementation, you would check which city belongs to the zone
-            # For now, we'll return the first city as a placeholder
-            return top_cities[0]
-
-        return None
-
-    def generate_operational_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
-        """Generate comprehensive operational recommendations"""
-        recommendations = []
-        geographic_scope = requirements.get("geographic_scope", "the region")
-
-        # Capacity-related recommendations
-        if "capacity_analysis" in analysis_results:
-            capacity = analysis_results["capacity_analysis"]
-
-            # Low utilization recommendations
-            if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
-                recommendations.append({
-                    "title": "Optimize Underutilized Capacity",
-                    "description": f"Average utilization is {capacity['average_utilization']:.1%} in {geographic_scope}. Consider repurposing underutilized facilities or consolidating services.",
-                    "priority": "Medium",
-                    "data_source": "Capacity utilization analysis"
-                })
-
-            # Capacity growth recommendations
-            if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
-                recommendations.append({
-                    "title": "Expand Capacity Strategically",
-                    "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}% in {geographic_scope}. Invest in new facilities or expand existing ones to meet demand.",
-                    "priority": "High",
-                    "data_source": "Capacity trend analysis"
-                })
-
-            # Zone-specific recommendations
-            if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
-                zone_name = "a zone"
-                for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
-                    if key in capacity["max_percentage_decrease"]:
-                        zone_name = capacity["max_percentage_decrease"][key]
-                        break
-
-                decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
-
-                if zone_name and decrease:
-                    recommendations.append({
-                        "title": f"Address Capacity Decline in {zone_name}",
-                        "description": f"{zone_name} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
-                        "priority": "High",
-                        "data_source": "Zone capacity analysis"
-                    })
-
-        # Long-term care recommendations
-        if "long_term_care_assessment" in analysis_results:
-            ltc_data = analysis_results["long_term_care_assessment"]
-
-            if ltc_data.get("capacity_assessment") == "insufficient":
-                major_city = ltc_data.get("major_city", "the major city")
-                ratio = ltc_data.get("nursing_to_hospital_ratio", 0)
-
-                recommendations.append({
-                    "title": f"Expand Long-Term Care Capacity in {major_city}",
-                    "description": f"Nursing/residential care to hospital ratio is {ratio:.2f} in {major_city}, which is insufficient. Invest in new long-term care beds or repurpose existing facilities.",
-                    "priority": "High",
-                    "data_source": "Long-term care capacity assessment"
-                })
-
-        # Resource allocation recommendations
-        if "resource_allocation" in analysis_results:
-            resources = analysis_results["resource_allocation"]
-
-            if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
-                recommendations.append({
-                    "title": "Increase Staffing Levels",
-                    "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f} in {geographic_scope}, which may be insufficient. Consider hiring additional staff.",
-                    "priority": "High",
-                    "data_source": "Resource allocation analysis"
-                })
-
-        # Ensure we have at least 3 recommendations as required
-        while len(recommendations) < 3:
-            recommendations.append({
-                "title": "Implement Comprehensive Capacity Management",
-                "description": "Develop a comprehensive capacity management system that includes real-time monitoring, predictive analytics, and dynamic resource allocation.",
-                "priority": "Medium",
-                "data_source": "General best practices"
-            })
-
-        # Sort by priority
-        priority_order = {"High": 0, "Medium": 1, "Low": 2}
-        recommendations.sort(key=lambda x: priority_order.get(x["priority"], 3))
-
-        return recommendations
-
-    def validate_analysis_completeness(self, required_tasks: List[str], completed_tasks: List[str]) -> Dict[str, Any]:
-        """Validate that all required tasks were completed"""
-        # Count only required tasks toward the rate; an empty requirement list is vacuously complete
-        completed_required = [task for task in required_tasks if task in completed_tasks]
-        validation = {
-            "all_tasks_completed": True,
-            "missing_tasks": [],
-            "completion_rate": len(completed_required) / len(required_tasks) if required_tasks else 1.0
-        }
-
-        for task in required_tasks:
-            if task not in completed_tasks:
-                validation["all_tasks_completed"] = False
-                validation["missing_tasks"].append(task)
-
-        return validation
-
-    def analyze_facility_distribution(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced facility distribution analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Facility type distribution
-            type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
-            if type_col:
-                # Ensure we're working with string data
-                filtered_df[type_col] = filtered_df[type_col].astype(str)
-                type_dist = filtered_df[type_col].value_counts().to_dict()
-                results["facility_type_distribution"] = type_dist
-
-                # Calculate diversity index
-                diversity = self._calculate_diversity_index(type_dist)
-                results["facility_diversity"] = diversity
-
-            # Geographic distribution
-            geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
-            if geo_col:
-                # Ensure we're working with string data
-                filtered_df[geo_col] = filtered_df[geo_col].astype(str)
-                geo_dist = filtered_df[geo_col].value_counts().to_dict()
-                results["geographic_distribution"] = geo_dist
-
-                # Calculate Gini coefficient for inequality
-                gini = self._calculate_gini(list(geo_dist.values()))
-                results["geographic_inequality"] = gini
-
-            # City distribution
-            city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
-            if city_col:
-                # Ensure we're working with string data
-                filtered_df[city_col] = filtered_df[city_col].astype(str)
-                city_counts = filtered_df[city_col].value_counts().head(5)
-                top_cities = city_counts.index.tolist()
-
-                # Breakdown by facility type for top cities
-                city_breakdown = {}
-                for city in top_cities:
-                    city_data = filtered_df[filtered_df[city_col] == city]
-                    if not city_data.empty and type_col in city_data.columns:
-                        city_breakdown[city] = city_data[type_col].value_counts().to_dict()
-
-                results["top_cities"] = top_cities
-                results["city_breakdown"] = city_breakdown
-
-            # Total facilities count
-            results["total_facilities"] = len(filtered_df)
-
-        return results
-
-    def analyze_capacity(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced capacity analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Resolve the facility-type column once so both the capacity and
-            # utilization breakdowns below can use it without a NameError
-            type_col = self._find_column(filtered_df, ['type', 'facility_type'])
-
-            # Current capacity
-            capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
-            if capacity_col:
-                # Ensure we're working with numeric data
-                filtered_df[capacity_col] = pd.to_numeric(filtered_df[capacity_col], errors='coerce')
-                total_capacity = filtered_df[capacity_col].sum()
-                results["total_capacity"] = total_capacity
-
-                # Capacity by facility type
-                if type_col and type_col in filtered_df.columns:
-                    capacity_by_type = filtered_df.groupby(type_col)[capacity_col].sum().to_dict()
-                    results["capacity_by_type"] = capacity_by_type
-
-            # Capacity utilization
-            utilization_col = self._find_column(filtered_df, ['utilization', 'occupancy', 'occupancy_rate'])
-            if utilization_col:
-                # Ensure we're working with numeric data
-                filtered_df[utilization_col] = pd.to_numeric(filtered_df[utilization_col], errors='coerce')
-                avg_utilization = filtered_df[utilization_col].mean()
-                results["average_utilization"] = avg_utilization
-
-                # Utilization by facility type
-                if type_col and type_col in filtered_df.columns:
-                    utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
-                    results["utilization_by_type"] = utilization_by_type
-
-            # Capacity trends (sort year columns so earliest/latest are chronological)
-            time_cols = sorted(col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024']))
-            if len(time_cols) >= 2:
-                trend_data = {}
-                for col in time_cols:
-                    # Ensure we're working with numeric data
-                    filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')
-                    trend_data[col] = filtered_df[col].sum()
-                results["capacity_trends"] = trend_data
-
-                # Calculate growth rate
-                latest = time_cols[-1]
-                earliest = time_cols[0]
-                if trend_data[earliest] > 0:  # Avoid division by zero
-                    growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
-                    results["capacity_growth_rate"] = growth_rate
-
-            # Bed change analysis
-            prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
-            current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
-
-            if prev_col and current_col:
-                # Ensure we're working with numeric data
-                filtered_df[prev_col] = pd.to_numeric(filtered_df[prev_col], errors='coerce')
-                filtered_df[current_col] = pd.to_numeric(filtered_df[current_col], errors='coerce')
-
-                # Calculate bed change
-                filtered_df['bed_change'] = filtered_df[current_col] - filtered_df[prev_col]
-
-                # Calculate percentage change
-                filtered_df['percent_change'] = filtered_df.apply(
-                    lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
-                    axis=1
-                )
-
-                # Zone/Region-level analysis
-                zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
-                if zone_col:
-                    # Ensure we're working with string data
-                    filtered_df[zone_col] = filtered_df[zone_col].astype(str)
-
-                    zone_summary = filtered_df.groupby(zone_col).agg({
-                        current_col: 'sum',
-                        prev_col: 'sum',
-                        'bed_change': 'sum'
-                    }).reset_index()
-
-                    zone_summary['percent_change'] = zone_summary.apply(
-                        lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
-                        axis=1
-                    )
-
-                    results["zone_summary"] = zone_summary.to_dict('records')
-
-                    # Find zones with largest changes
-                    if not zone_summary.empty:
-                        # Get zone with largest absolute decrease
-                        if zone_summary['bed_change'].notna().any():
-                            max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
-                            max_abs_decrease = zone_summary.loc[max_abs_decrease_idx]
-                            results["max_absolute_decrease"] = max_abs_decrease.to_dict()
-
-                        # Get zone with largest percentage decrease
-                        if zone_summary['percent_change'].notna().any():
-                            max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
-                            max_pct_decrease = zone_summary.loc[max_pct_decrease_idx]
-                            results["max_percentage_decrease"] = max_pct_decrease.to_dict()
-
-                # Identify facilities with largest declines
-                facilities_decline = filtered_df.sort_values('bed_change').head(5)
-                if not facilities_decline.empty:
-                    results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')
-
-        return results
-
-    def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
-        """Filter dataframe based on geographic scope and regions"""
-        if geographic_scope == "Unknown" and not regions:
-            return df.copy()
-
-        # Try to find a geographic column
-        geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
-
-        if geo_col is None:
-            return df.copy()
-
-        # Work on a copy so the registry's original dataframe is not mutated
-        df = df.copy()
-
-        # Ensure we're working with string data
-        try:
-            df[geo_col] = df[geo_col].astype(str)
-        except Exception as e:
-            logger.warning(f"Error converting column {geo_col} to string: {str(e)}")
-            return df
-
-        # Create filters
-        filters = []
-
-        # Add geographic scope filter
-        if geographic_scope != "Unknown":
-            # Create a list of possible values for the geographic scope
-            scope_values = [geographic_scope.lower()]
-
-            # Add common abbreviations
-            abbreviations = {
-                # Canadian provinces
-                "alberta": "ab", "british columbia": "bc", "ontario": "on", "quebec": "qc",
-                "manitoba": "mb", "saskatchewan": "sk", "nova scotia": "ns", "new brunswick": "nb",
-                "prince edward island": "pe", "newfoundland": "nl", "yukon": "yt",
-                "northwest territories": "nt", "nunavut": "nu",
-                # US states
-                "alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
-                "california": "ca", "colorado": "co", "connecticut": "ct", "delaware": "de",
-                "florida": "fl", "georgia": "ga", "hawaii": "hi", "idaho": "id",
-                "illinois": "il", "indiana": "in", "iowa": "ia", "kansas": "ks",
-                "kentucky": "ky", "louisiana": "la", "maine": "me", "maryland": "md",
-                "massachusetts": "ma", "michigan": "mi", "minnesota": "mn", "mississippi": "ms",
-                "missouri": "mo", "montana": "mt", "nebraska": "ne", "nevada": "nv",
-                "new hampshire": "nh", "new jersey": "nj", "new mexico": "nm", "new york": "ny",
-                "north carolina": "nc", "north dakota": "nd", "ohio": "oh", "oklahoma": "ok",
-                "oregon": "or", "pennsylvania": "pa", "rhode island": "ri", "south carolina": "sc",
-                "south dakota": "sd", "tennessee": "tn", "texas": "tx", "utah": "ut",
-                "vermont": "vt", "virginia": "va", "washington": "wa", "west virginia": "wv",
-                "wisconsin": "wi", "wyoming": "wy"
-            }
-
-            if geographic_scope.lower() in abbreviations:
-                scope_values.append(abbreviations[geographic_scope.lower()])
-
-            try:
-                scope_filter = df[geo_col].str.lower().isin(scope_values)
-                filters.append(scope_filter)
-            except Exception as e:
-                logger.warning(f"Error creating scope filter: {str(e)}")
-
-        # Add region filters
-        if regions:
-            try:
-                region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
-                filters.append(region_filter)
-            except Exception as e:
-                logger.warning(f"Error creating region filter: {str(e)}")
-
-        # Apply filters (a row passes if it matches any filter)
-        if filters:
-            try:
-                combined_filter = filters[0]
-                for f in filters[1:]:
-                    combined_filter = combined_filter | f
-                return df[combined_filter].copy()
-            except Exception as e:
-                logger.warning(f"Error applying filters: {str(e)}")
-
-        return df.copy()
-
-    def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze resource allocation patterns"""
-        results = {}
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-            # Work on a copy so derived columns don't leak into the registry's data
-            df = df.copy()
-
-            # Staff analysis
-            staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
-            if staff_col:
-                # Ensure we're working with numeric data
-                df[staff_col] = pd.to_numeric(df[staff_col], errors='coerce')
-                total_staff = df[staff_col].sum()
-                results["total_staff"] = total_staff
-
-                # Staff per bed ratio
-                capacity_col = self._find_column(df, ['capacity', 'beds'])
-                if capacity_col and capacity_col in df.columns:
-                    # Ensure we're working with numeric data
-                    df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
-                    df['staff_per_bed'] = df[staff_col] / df[capacity_col].replace(0, np.nan)  # Avoid division by zero
-                    avg_staff_per_bed = df['staff_per_bed'].mean()
-                    results["staff_per_bed_ratio"] = avg_staff_per_bed
-
-            # Equipment analysis
-            equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
-            if equipment_cols:
-                equipment_summary = {}
-                for col in equipment_cols:
-                    # Ensure we're working with numeric data
-                    df[col] = pd.to_numeric(df[col], errors='coerce')
-                    equipment_summary[col] = df[col].sum()
-                results["equipment_summary"] = equipment_summary
-
-        return results
-
-    def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze trends in healthcare data"""
-        results = {}
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-            # Work on a copy so type coercion doesn't leak into the registry's data
-            df = df.copy()
-
-            # Find time-based columns (sorted so year-over-year pairs are chronological)
-            time_cols = sorted(col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024']))
-
-            if len(time_cols) >= 2:
-                trends = {}
-
-                # Calculate year-over-year changes
-                for i in range(1, len(time_cols)):
-                    prev_year = time_cols[i-1]
-                    curr_year = time_cols[i]
-
-                    # Ensure we're working with numeric data
-                    df[prev_year] = pd.to_numeric(df[prev_year], errors='coerce')
-                    df[curr_year] = pd.to_numeric(df[curr_year], errors='coerce')
-
-                    prev_total = df[prev_year].sum()
-                    curr_total = df[curr_year].sum()
-
-                    if prev_total > 0:  # Avoid division by zero
-                        change_pct = (curr_total - prev_total) / prev_total * 100
-                        trends[f"{prev_year}_to_{curr_year}"] = {
-                            "absolute_change": curr_total - prev_total,
-                            "percentage_change": change_pct
-                        }
-
-                results["year_over_year_trends"] = trends
-
+    def __init__(self, registry: DataRegistry):
+        self.registry = registry
+
+    def comprehensive_analysis(self, scenario: str) -> dict:
+        results = {}
+        for name in self.registry.names():
+            results[name] = self.registry.get(name)
         return results
-
-    def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
-        """Identify opportunities for AI integration and data enhancement"""
-        opportunities = {
-            "data_integration": [],
-            "ai_applications": [],
-            "enhanced_metrics": []
-        }
-
-        # Data integration opportunities
-        opportunities["data_integration"].append({
-            "opportunity": "Integrate real-time occupancy data",
-            "description": "Combine current facility data with real-time occupancy monitoring systems",
-            "benefit": "Enable dynamic resource allocation and surge planning"
-        })
-
-        opportunities["data_integration"].append({
-            "opportunity": "Incorporate demographic data",
-            "description": "Add population demographics and health needs data",
-            "benefit": "Improve demand forecasting and service planning"
-        })
-
-        # AI application opportunities
-        opportunities["ai_applications"].append({
-            "opportunity": "Predictive capacity modeling",
-            "description": "Use ML to forecast capacity needs based on trends and external factors",
-            "benefit": "Proactive resource planning and reduced wait times"
-        })
-
-        opportunities["ai_applications"].append({
-            "opportunity": "Optimization algorithms",
-            "description": "Implement AI for staff scheduling and resource allocation",
-            "benefit": "Improved efficiency and reduced operational costs"
-        })
-
-        # Enhanced metrics
-        opportunities["enhanced_metrics"].append({
-            "metric": "Patient flow efficiency",
-            "description": "Measure time from admission to discharge across facilities",
-            "benefit": "Identify bottlenecks and improve patient experience"
-        })
-
-        opportunities["enhanced_metrics"].append({
-            "metric": "Resource utilization index",
-            "description": "Composite metric combining staff, equipment, and space utilization",
-            "benefit": "Holistic view of operational efficiency"
-        })
-
-        return opportunities
-
-    # Helper methods
-    def _find_column(self, df, patterns):
-        """Find the first column matching any pattern"""
-        if df is None or df.empty:
-            return None
-        for col in df.columns:
-            if any(pattern.lower() in col.lower() for pattern in patterns):
-                return col
-        return None
-
-    def _calculate_gini(self, values):
-        """Calculate Gini coefficient for inequality measurement"""
-        if not values or len(values) < 2:
-            return 0
-
-        values = np.asarray(sorted(values), dtype=float)
-        n = len(values)
-        index = np.arange(1, n + 1)
-        total = np.sum(values)
-
-        if total == 0:
-            return 0
-
-        gini = np.sum((2 * index - n - 1) * values) / (n * total)
-        return gini
-
-    def _calculate_diversity_index(self, distribution):
-        """Calculate Shannon diversity index"""
-        if not distribution:
-            return 0
-
-        total = sum(distribution.values())
-        if total == 0:
-            return 0
-
-        proportions = [count / total for count in distribution.values() if count > 0]
-        if not proportions:
-            return 0
-
-        return -sum(p * np.log(p) for p in proportions)
-
-    def _extract_geographic_scope(self, text):
-        """Extract geographic scope from text"""
-        # Look for province/state names
-        provinces = [
-            "alberta", "british columbia", "ontario", "quebec", "manitoba",
-            "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
-            "newfoundland", "yukon", "northwest territories", "nunavut"
-        ]
-
-        states = [
-            "alabama", "alaska", "arizona", "arkansas", "california", "colorado",
-            "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho",
-            "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
-            "maine", "maryland", "massachusetts", "michigan", "minnesota",
-            "mississippi", "missouri", "montana", "nebraska", "nevada",
-            "new hampshire", "new jersey", "new mexico", "new york",
-            "north carolina", "north dakota", "ohio", "oklahoma", "oregon",
-            "pennsylvania", "rhode island", "south carolina", "south dakota",
-            "tennessee", "texas", "utah", "vermont", "virginia", "washington",
-            "west virginia", "wisconsin", "wyoming"
-        ]
-
-        text_lower = text.lower()
-
-        # Check for provinces
-        for province in provinces:
-            if province in text_lower:
-                return province.title()
-
-        # Check for states
-        for state in states:
-            if state in text_lower:
-                return state.title()
-
-        # Check for countries
-        if "canada" in text_lower:
-            return "Canada"
-        if "usa" in text_lower or "united states" in text_lower:
-            return "United States"
-
-        return "Unknown"
-
-    def _extract_time_period(self, text):
-        """Extract time period from text"""
-        # Look for year patterns
-        years = re.findall(r'\b(20\d{2})\b', text)
-        if len(years) >= 2:
-            return f"{min(years)}-{max(years)}"
-        return "Unknown"
-
-    def _extract_facility_types(self, text):
-        """Extract facility types from text"""
-        types = []
-        if "hospital" in text.lower():
-            types.append("Hospitals")
-        if "nursing" in text.lower() or "long-term" in text.lower():
-            types.append("Nursing homes")
-        if "clinic" in text.lower():
-            types.append("Clinics")
-        return types
-
-    def _extract_metrics(self, text):
-        """Extract required metrics from text"""
-        metrics = []
-        if "bed" in text.lower():
-            metrics.append("Bed capacity")
-        if "occupancy" in text.lower():
-            metrics.append("Occupancy rates")
-        if "staff" in text.lower():
-            metrics.append("Staffing levels")
-        return metrics
-
-    def _extract_regions(self, text):
-        """Extract specific regions mentioned in the scenario"""
-        # Look for region names in the scenario
-        regions = []
-
-        # Common region patterns - this could be expanded
-        region_patterns = [
-            r'([A-Z][a-z]+ (Zone|Region|Area|District))',
-            r'(North|South|East|West|Central)',
-            r'([A-Z][a-z]+ (City|County|State|Province))',
-            r'([A-Z][a-z]+)'
-        ]
-
-        for pattern in region_patterns:
-            matches = re.findall(pattern, text)
-            for match in matches:
-                if isinstance(match, tuple):
-                    regions.append(match[0])
-                else:
-                    regions.append(match)
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_regions = [r for r in regions if not (r in seen or seen.add(r))]
-
-        return unique_regions
-
-    def _identify_relevant_data(self, text):
-        """Identify relevant datasets for the scenario"""
-        # Use data registry's find_related_datasets method
-        keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
-        return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]
 
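The removed inequality and diversity helpers are self-contained formulas, so they can be checked in isolation. A small worked verification of the same math (the test values are made up):

# standalone check of the removed Gini / Shannon helpers
import numpy as np

def gini(values):
    # Gini coefficient over sorted x: sum((2i - n - 1) * x_i) / (n * sum(x))
    if not values or len(values) < 2:
        return 0.0
    x = np.asarray(sorted(values), dtype=float)
    n, total = len(x), x.sum()
    if total == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return float(np.sum((2 * index - n - 1) * x) / (n * total))

def shannon(distribution):
    # Shannon diversity index: -sum(p * ln p) over nonzero proportions
    total = sum(distribution.values())
    if total == 0:
        return 0.0
    proportions = [count / total for count in distribution.values() if count > 0]
    return -sum(p * np.log(p) for p in proportions)

assert gini([1, 1, 1, 1]) == 0.0                      # perfect equality
assert abs(gini([0, 0, 0, 4]) - 0.75) < 1e-9          # one region holds everything
assert abs(shannon({"Hospitals": 5, "Clinics": 5}) - np.log(2)) < 1e-9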
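For context on what the commit leaves behind: the new 13-line class simply returns every registered dataset keyed by name, ignoring the scenario text. A minimal usage sketch, assuming data_registry.DataRegistry exposes names() and get(name) returning pandas DataFrames as the new code implies (the no-argument constructor and the scenario string are hypothetical):

# usage sketch for the slimmed-down HealthcareAnalyzer
from data_registry import DataRegistry
from healthcare_analysis import HealthcareAnalyzer

registry = DataRegistry()  # hypothetical: constructor not shown in the diff
analyzer = HealthcareAnalyzer(registry)

results = analyzer.comprehensive_analysis("Assess bed capacity in Alberta")
for name, df in results.items():
    print(name, df.shape)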