Rajan Sharma committed on
Commit
340e032
·
verified ·
1 Parent(s): 6c051db

Update healthcare_analysis.py

Browse files
Files changed (1) hide show
  1. healthcare_analysis.py +87 -62
healthcare_analysis.py CHANGED
@@ -82,11 +82,12 @@ class HealthcareAnalyzer:
82
  # Look for region names in the scenario
83
  regions = []
84
 
85
- # Common region patterns - this could be expanded
86
  region_patterns = [
87
  r'([A-Z][a-z]+ (Zone|Region|Area|District))',
88
- r'(North|South|East|West|Central|Calgary|Edmonton|Toronto|Vancouver|Montreal)',
89
- r'(Alberta|British Columbia|Ontario|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|PEI|Newfoundland|Yukon|NWT|Nunavut)'
 
90
  ]
91
 
92
  import re
@@ -115,13 +116,13 @@ class HealthcareAnalyzer:
115
  if df is None or df.empty:
116
  continue
117
 
118
- # Filter data based on geographic scope
119
  filtered_df = self._filter_by_geography(df, geographic_scope, regions)
120
 
121
  if filtered_df.empty:
122
  continue
123
 
124
- # Facility type distribution
125
  type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
126
  if type_col:
127
  # Ensure we're working with string data
@@ -133,7 +134,7 @@ class HealthcareAnalyzer:
133
  diversity = self._calculate_diversity_index(type_dist)
134
  results["facility_diversity"] = diversity
135
 
136
- # Geographic distribution
137
  geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
138
  if geo_col:
139
  # Ensure we're working with string data
@@ -145,7 +146,7 @@ class HealthcareAnalyzer:
145
  gini = self._calculate_gini(list(geo_dist.values()))
146
  results["geographic_inequality"] = gini
147
 
148
- # City distribution
149
  city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
150
  if city_col:
151
  # Ensure we're working with string data
@@ -179,13 +180,13 @@ class HealthcareAnalyzer:
179
  if df is None or df.empty:
180
  continue
181
 
182
- # Filter data based on geographic scope
183
  filtered_df = self._filter_by_geography(df, geographic_scope, regions)
184
 
185
  if filtered_df.empty:
186
  continue
187
 
188
- # Current capacity
189
  capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
190
  if capacity_col:
191
  # Ensure we're working with numeric data
@@ -212,7 +213,7 @@ class HealthcareAnalyzer:
212
  utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
213
  results["utilization_by_type"] = utilization_by_type
214
 
215
- # Capacity trends
216
  time_cols = [col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
217
  if len(time_cols) >= 2:
218
  trend_data = {}
@@ -230,7 +231,7 @@ class HealthcareAnalyzer:
230
  growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
231
  results["capacity_growth_rate"] = growth_rate
232
 
233
- # Bed change analysis
234
  prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
235
  current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
236
 
@@ -248,7 +249,7 @@ class HealthcareAnalyzer:
248
  axis=1
249
  )
250
 
251
- # Zone/Region-level analysis
252
  zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
253
  if zone_col:
254
  # Ensure we're working with string data
@@ -289,67 +290,86 @@ class HealthcareAnalyzer:
289
  return results
290
 
291
  def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
292
- """Filter dataframe based on geographic scope and regions"""
293
  if geographic_scope == "Unknown" and not regions:
294
  return df.copy()
295
 
296
- # Try to find a geographic column
297
  geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
298
 
299
  if geo_col is None:
300
  return df.copy()
301
 
302
  # Ensure we're working with string data
303
- df[geo_col] = df[geo_col].astype(str)
 
 
 
 
304
 
305
  # Create filters
306
  filters = []
307
 
308
- # Add geographic scope filter
309
  if geographic_scope != "Unknown":
310
  # Create a list of possible values for the geographic scope
311
  scope_values = [geographic_scope.lower()]
312
 
313
- # Add common abbreviations
314
  abbreviations = {
315
- "alberta": "ab",
316
- "british columbia": "bc",
317
- "ontario": "on",
318
- "quebec": "qc",
319
- "manitoba": "mb",
320
- "saskatchewan": "sk",
321
- "nova scotia": "ns",
322
- "new brunswick": "nb",
323
- "prince edward island": "pe",
324
- "newfoundland": "nl",
325
- "yukon": "yt",
326
- "northwest territories": "nt",
327
- "nunavut": "nu"
 
 
 
 
 
 
328
  }
329
 
330
  if geographic_scope.lower() in abbreviations:
331
  scope_values.append(abbreviations[geographic_scope.lower()])
332
 
333
- scope_filter = df[geo_col].str.lower().isin(scope_values)
334
- filters.append(scope_filter)
 
 
 
335
 
336
- # Add region filters
337
  if regions:
338
- region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
339
- filters.append(region_filter)
 
 
 
340
 
341
  # Apply filters
342
  if filters:
343
- combined_filter = filters[0]
344
- for f in filters[1:]:
345
- combined_filter = combined_filter | f
346
-
347
- return df[combined_filter].copy()
 
 
 
348
 
349
  return df.copy()
350
 
351
  def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
352
- """Analyze resource allocation patterns"""
353
  results = {}
354
 
355
  for data_name in relevant_data:
@@ -357,7 +377,7 @@ class HealthcareAnalyzer:
357
  if df is None or df.empty:
358
  continue
359
 
360
- # Staff analysis
361
  staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
362
  if staff_col:
363
  # Ensure we're working with numeric data
@@ -374,7 +394,7 @@ class HealthcareAnalyzer:
374
  avg_staff_per_bed = df['staff_per_bed'].mean()
375
  results["staff_per_bed_ratio"] = avg_staff_per_bed
376
 
377
- # Equipment analysis
378
  equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
379
  if equipment_cols:
380
  equipment_summary = {}
@@ -387,7 +407,7 @@ class HealthcareAnalyzer:
387
  return results
388
 
389
  def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
390
- """Analyze trends in healthcare data"""
391
  results = {}
392
 
393
  for data_name in relevant_data:
@@ -395,7 +415,7 @@ class HealthcareAnalyzer:
395
  if df is None or df.empty:
396
  continue
397
 
398
- # Find time-based columns
399
  time_cols = [col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
400
 
401
  if len(time_cols) >= 2:
@@ -425,7 +445,7 @@ class HealthcareAnalyzer:
425
  return results
426
 
427
  def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
428
- """Generate data-driven operational recommendations"""
429
  recommendations = []
430
  geographic_scope = requirements.get("geographic_scope", "the region")
431
 
@@ -451,16 +471,21 @@ class HealthcareAnalyzer:
451
  "data_source": "Capacity trend analysis"
452
  })
453
 
454
- # Zone-specific recommendations
455
  if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
456
- zone_col = capacity.get("columns_used", {}).get("zone")
457
- zone = capacity["max_percentage_decrease"].get(zone_col, 'a zone') if zone_col else 'a zone'
 
 
 
 
 
458
  decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
459
 
460
- if zone and decrease:
461
  recommendations.append({
462
- "title": f"Address Capacity Decline in {zone}",
463
- "description": f"{zone} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
464
  "priority": "High",
465
  "data_source": "Zone capacity analysis"
466
  })
@@ -496,7 +521,7 @@ class HealthcareAnalyzer:
496
  return recommendations
497
 
498
  def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
499
- """Identify opportunities for AI integration and data enhancement"""
500
  opportunities = {
501
  "data_integration": [],
502
  "ai_applications": [],
@@ -546,7 +571,7 @@ class HealthcareAnalyzer:
546
 
547
  # Helper methods
548
  def _find_column(self, df, patterns):
549
- """Find the first column matching any pattern"""
550
  if df is None or df.empty:
551
  return None
552
  for col in df.columns:
@@ -555,7 +580,7 @@ class HealthcareAnalyzer:
555
  return None
556
 
557
  def _calculate_gini(self, values):
558
- """Calculate Gini coefficient for inequality measurement"""
559
  if not values or len(values) < 2:
560
  return 0
561
 
@@ -571,7 +596,7 @@ class HealthcareAnalyzer:
571
  return gini
572
 
573
  def _calculate_diversity_index(self, distribution):
574
- """Calculate Shannon diversity index"""
575
  if not distribution:
576
  return 0
577
 
@@ -586,8 +611,8 @@ class HealthcareAnalyzer:
586
  return -sum(p * np.log(p) for p in proportions)
587
 
588
  def _extract_geographic_scope(self, text):
589
- """Extract geographic scope from text"""
590
- # Look for province/state names
591
  provinces = [
592
  "alberta", "british columbia", "ontario", "quebec", "manitoba",
593
  "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
@@ -628,7 +653,7 @@ class HealthcareAnalyzer:
628
  return "Unknown"
629
 
630
  def _extract_time_period(self, text):
631
- """Extract time period from text"""
632
  # Look for year patterns
633
  import re
634
  years = re.findall(r'\b(20\d{2})\b', text)
@@ -637,7 +662,7 @@ class HealthcareAnalyzer:
637
  return "Unknown"
638
 
639
  def _extract_facility_types(self, text):
640
- """Extract facility types from text"""
641
  types = []
642
  if "hospital" in text.lower():
643
  types.append("Hospitals")
@@ -648,7 +673,7 @@ class HealthcareAnalyzer:
648
  return types
649
 
650
  def _extract_metrics(self, text):
651
- """Extract required metrics from text"""
652
  metrics = []
653
  if "bed" in text.lower():
654
  metrics.append("Bed capacity")
@@ -659,7 +684,7 @@ class HealthcareAnalyzer:
659
  return metrics
660
 
661
  def _identify_relevant_data(self, text):
662
- """Identify relevant datasets for the scenario"""
663
  # Use data registry's find_related_datasets method
664
  keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
665
  return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]
 
82
  # Look for region names in the scenario
83
  regions = []
84
 
85
+ # Generic region patterns - works for any healthcare scenario
86
  region_patterns = [
87
  r'([A-Z][a-z]+ (Zone|Region|Area|District))',
88
+ r'(North|South|East|West|Central)',
89
+ r'([A-Z][a-z]+ (City|County|State|Province))',
90
+ r'([A-Z][a-z]+)'
91
  ]
92
 
93
  import re
 
116
  if df is None or df.empty:
117
  continue
118
 
119
+ # Filter data based on geographic scope - generic approach
120
  filtered_df = self._filter_by_geography(df, geographic_scope, regions)
121
 
122
  if filtered_df.empty:
123
  continue
124
 
125
+ # Facility type distribution - generic column finding
126
  type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
127
  if type_col:
128
  # Ensure we're working with string data
 
134
  diversity = self._calculate_diversity_index(type_dist)
135
  results["facility_diversity"] = diversity
136
 
137
+ # Geographic distribution - generic column finding
138
  geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
139
  if geo_col:
140
  # Ensure we're working with string data
 
146
  gini = self._calculate_gini(list(geo_dist.values()))
147
  results["geographic_inequality"] = gini
148
 
149
+ # City distribution - generic column finding
150
  city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
151
  if city_col:
152
  # Ensure we're working with string data
 
180
  if df is None or df.empty:
181
  continue
182
 
183
+ # Filter data based on geographic scope - generic approach
184
  filtered_df = self._filter_by_geography(df, geographic_scope, regions)
185
 
186
  if filtered_df.empty:
187
  continue
188
 
189
+ # Current capacity - generic column finding
190
  capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
191
  if capacity_col:
192
  # Ensure we're working with numeric data
 
213
  utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
214
  results["utilization_by_type"] = utilization_by_type
215
 
216
+ # Capacity trends - generic approach for time columns
217
  time_cols = [col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
218
  if len(time_cols) >= 2:
219
  trend_data = {}
 
231
  growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
232
  results["capacity_growth_rate"] = growth_rate
233
 
234
+ # Bed change analysis - generic column finding
235
  prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
236
  current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
237
 
 
249
  axis=1
250
  )
251
 
252
+ # Zone/Region-level analysis - generic column finding
253
  zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
254
  if zone_col:
255
  # Ensure we're working with string data
 
290
  return results
291
 
292
  def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
293
+ """Filter dataframe based on geographic scope and regions - generic approach"""
294
  if geographic_scope == "Unknown" and not regions:
295
  return df.copy()
296
 
297
+ # Try to find a geographic column - generic approach
298
  geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
299
 
300
  if geo_col is None:
301
  return df.copy()
302
 
303
  # Ensure we're working with string data
304
+ try:
305
+ df[geo_col] = df[geo_col].astype(str)
306
+ except Exception as e:
307
+ logger.warning(f"Error converting column {geo_col} to string: {str(e)}")
308
+ return df.copy()
309
 
310
  # Create filters
311
  filters = []
312
 
313
+ # Add geographic scope filter - generic approach
314
  if geographic_scope != "Unknown":
315
  # Create a list of possible values for the geographic scope
316
  scope_values = [geographic_scope.lower()]
317
 
318
+ # Add common abbreviations - generic for any region
319
  abbreviations = {
320
+ # Canadian provinces
321
+ "alberta": "ab", "british columbia": "bc", "ontario": "on", "quebec": "qc",
322
+ "manitoba": "mb", "saskatchewan": "sk", "nova scotia": "ns", "new brunswick": "nb",
323
+ "prince edward island": "pe", "newfoundland": "nl", "yukon": "yt",
324
+ "northwest territories": "nt", "nunavut": "nu",
325
+ # US states
326
+ "alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
327
+ "california": "ca", "colorado": "co", "connecticut": "ct", "delaware": "de",
328
+ "florida": "fl", "georgia": "ga", "hawaii": "hi", "idaho": "id",
329
+ "illinois": "il", "indiana": "in", "iowa": "ia", "kansas": "ks",
330
+ "kentucky": "ky", "louisiana": "la", "maine": "me", "maryland": "md",
331
+ "massachusetts": "ma", "michigan": "mi", "minnesota": "mn", "mississippi": "ms",
332
+ "missouri": "mo", "montana": "mt", "nebraska": "ne", "nevada": "nv",
333
+ "new hampshire": "nh", "new jersey": "nj", "new mexico": "nm", "new york": "ny",
334
+ "north carolina": "nc", "north dakota": "nd", "ohio": "oh", "oklahoma": "ok",
335
+ "oregon": "or", "pennsylvania": "pa", "rhode island": "ri", "south carolina": "sc",
336
+ "south dakota": "sd", "tennessee": "tn", "texas": "tx", "utah": "ut",
337
+ "vermont": "vt", "virginia": "va", "washington": "wa", "west virginia": "wv",
338
+ "wisconsin": "wi", "wyoming": "wy"
339
  }
340
 
341
  if geographic_scope.lower() in abbreviations:
342
  scope_values.append(abbreviations[geographic_scope.lower()])
343
 
344
+ try:
345
+ scope_filter = df[geo_col].str.lower().isin(scope_values)
346
+ filters.append(scope_filter)
347
+ except Exception as e:
348
+ logger.warning(f"Error creating scope filter: {str(e)}")
349
 
350
+ # Add region filters - generic approach
351
  if regions:
352
+ try:
353
+ region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
354
+ filters.append(region_filter)
355
+ except Exception as e:
356
+ logger.warning(f"Error creating region filter: {str(e)}")
357
 
358
  # Apply filters
359
  if filters:
360
+ try:
361
+ combined_filter = filters[0]
362
+ for f in filters[1:]:
363
+ combined_filter = combined_filter | f
364
+
365
+ return df[combined_filter].copy()
366
+ except Exception as e:
367
+ logger.warning(f"Error applying filters: {str(e)}")
368
 
369
  return df.copy()
370
 
371
  def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
372
+ """Analyze resource allocation patterns - generic approach"""
373
  results = {}
374
 
375
  for data_name in relevant_data:
 
377
  if df is None or df.empty:
378
  continue
379
 
380
+ # Staff analysis - generic column finding
381
  staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
382
  if staff_col:
383
  # Ensure we're working with numeric data
 
394
  avg_staff_per_bed = df['staff_per_bed'].mean()
395
  results["staff_per_bed_ratio"] = avg_staff_per_bed
396
 
397
+ # Equipment analysis - generic approach
398
  equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
399
  if equipment_cols:
400
  equipment_summary = {}
 
407
  return results
408
 
409
  def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
410
+ """Analyze trends in healthcare data - generic approach"""
411
  results = {}
412
 
413
  for data_name in relevant_data:
 
415
  if df is None or df.empty:
416
  continue
417
 
418
+ # Find time-based columns - generic approach
419
  time_cols = [col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
420
 
421
  if len(time_cols) >= 2:
 
445
  return results
446
 
447
  def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
448
+ """Generate data-driven operational recommendations - generic approach"""
449
  recommendations = []
450
  geographic_scope = requirements.get("geographic_scope", "the region")
451
 
 
471
  "data_source": "Capacity trend analysis"
472
  })
473
 
474
+ # Zone-specific recommendations - generic approach
475
  if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
476
+ # Try to find the zone name using multiple possible keys
477
+ zone_name = "a zone"
478
+ for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
479
+ if key in capacity["max_percentage_decrease"]:
480
+ zone_name = capacity["max_percentage_decrease"][key]
481
+ break
482
+
483
  decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
484
 
485
+ if zone_name and decrease:
486
  recommendations.append({
487
+ "title": f"Address Capacity Decline in {zone_name}",
488
+ "description": f"{zone_name} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
489
  "priority": "High",
490
  "data_source": "Zone capacity analysis"
491
  })
 
521
  return recommendations
522
 
523
  def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
524
+ """Identify opportunities for AI integration and data enhancement - generic approach"""
525
  opportunities = {
526
  "data_integration": [],
527
  "ai_applications": [],
 
571
 
572
  # Helper methods
573
  def _find_column(self, df, patterns):
574
+ """Find the first column matching any pattern - generic approach"""
575
  if df is None or df.empty:
576
  return None
577
  for col in df.columns:
 
580
  return None
581
 
582
  def _calculate_gini(self, values):
583
+ """Calculate Gini coefficient for inequality measurement - generic approach"""
584
  if not values or len(values) < 2:
585
  return 0
586
 
 
596
  return gini
597
 
598
  def _calculate_diversity_index(self, distribution):
599
+ """Calculate Shannon diversity index - generic approach"""
600
  if not distribution:
601
  return 0
602
 
 
611
  return -sum(p * np.log(p) for p in proportions)
612
 
613
  def _extract_geographic_scope(self, text):
614
+ """Extract geographic scope from text - generic approach"""
615
+ # Look for province/state names - generic for any region
616
  provinces = [
617
  "alberta", "british columbia", "ontario", "quebec", "manitoba",
618
  "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
 
653
  return "Unknown"
654
 
655
  def _extract_time_period(self, text):
656
+ """Extract time period from text - generic approach"""
657
  # Look for year patterns
658
  import re
659
  years = re.findall(r'\b(20\d{2})\b', text)
 
662
  return "Unknown"
663
 
664
  def _extract_facility_types(self, text):
665
+ """Extract facility types from text - generic approach"""
666
  types = []
667
  if "hospital" in text.lower():
668
  types.append("Hospitals")
 
673
  return types
674
 
675
  def _extract_metrics(self, text):
676
+ """Extract required metrics from text - generic approach"""
677
  metrics = []
678
  if "bed" in text.lower():
679
  metrics.append("Bed capacity")
 
684
  return metrics
685
 
686
  def _identify_relevant_data(self, text):
687
+ """Identify relevant datasets for the scenario - generic approach"""
688
  # Use data registry's find_related_datasets method
689
  keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
690
  return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]