# healthcare_analysis.py
"""Keyword-driven analysis of healthcare facility datasets.

The module exposes :class:`HealthcareAnalyzer`, which reads a free-text
scenario, decides which analyses the text asks for, runs them against the
datasets offered by a data registry, and derives recommendations from the
resulting numbers.
"""
import logging
import re
from typing import Dict, List, Any, Optional, Tuple

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HealthcareAnalyzer:
    """Run facility, capacity, resource and trend analyses for a scenario.

    The registry passed to the constructor is used in two ways by the code
    below: ``data_registry.get(name)`` must return a DataFrame (or ``None``
    when the dataset is unavailable), and
    ``data_registry.find_related_datasets(keywords)`` must return an iterable
    of mappings that each carry a ``"name"`` key.

    Every ``analyze_*`` method walks all requested datasets and writes into a
    single result dict, so when several datasets provide the same kind of
    column the values from the later dataset replace the earlier ones.

    ``analysis_results`` is initialised to an empty dict; none of the methods
    in this class write to it.
    """

    # Scenario keywords (matched as lowercase substrings) that switch on each
    # analysis task.
    _TASK_KEYWORDS = {
        "facility_distribution": ["facility", "distribution", "location", "sites"],
        "capacity_analysis": ["capacity", "beds", "occupancy", "utilization"],
        "resource_allocation": ["resource", "allocation", "staffing", "equipment"],
        "trends": ["trend", "change", "growth", "decline", "pattern"],
    }

    # A column whose label contains one of these tokens is treated as holding
    # the figures for that year.
    _YEAR_TOKENS = ("2020", "2021", "2022", "2023", "2024")

    def __init__(self, data_registry):
        self.data_registry = data_registry
        self.analysis_results = {}

    def comprehensive_analysis(self, scenario_text: str) -> Dict[str, Any]:
        """Perform comprehensive healthcare scenario analysis.

        Args:
            scenario_text: Free-text description of the scenario.

        Returns:
            A dict with one entry per analysis task detected in the text
            (``"facility_distribution"``, ``"capacity_analysis"``,
            ``"resource_allocation"``, ``"trends"``), plus
            ``"recommendations"`` and ``"future_integration"``, which are
            always present.
        """
        logger.info("Starting comprehensive healthcare analysis")

        # Extract tasks and requirements
        tasks = self._extract_tasks(scenario_text)
        requirements = self._extract_requirements(scenario_text)

        # Identify relevant datasets
        relevant_data = self._identify_relevant_data(scenario_text)

        # Perform analyses based on tasks; the dict order fixes the order in
        # which the analyses run and appear in the output.
        analyses = {
            "facility_distribution": self.analyze_facility_distribution,
            "capacity_analysis": self.analyze_capacity,
            "resource_allocation": self.analyze_resource_allocation,
            "trends": self.analyze_trends,
        }
        results = {}
        for task, analyze in analyses.items():
            if task in tasks:
                results[task] = analyze(relevant_data)

        # Generate recommendations
        results["recommendations"] = self.generate_recommendations(results, requirements)

        # Future integration opportunities
        results["future_integration"] = self.identify_integration_opportunities(results)

        logger.info("Comprehensive analysis completed")
        return results

    def _extract_tasks(self, scenario_text: str) -> List[str]:
        """Return the task names whose keywords occur in the scenario text.

        Matching is a case-insensitive substring test, so ``"trends"`` in the
        text satisfies the keyword ``"trend"``.
        """
        text = scenario_text.lower()
        return [
            task_type
            for task_type, keywords in self._TASK_KEYWORDS.items()
            if any(kw in text for kw in keywords)
        ]

    def _extract_requirements(self, scenario_text: str) -> Dict[str, Any]:
        """Extract specific requirements from scenario text."""
        return {
            "geographic_scope": self._extract_geographic_scope(scenario_text),
            "time_period": self._extract_time_period(scenario_text),
            "facility_types": self._extract_facility_types(scenario_text),
            "metrics_needed": self._extract_metrics(scenario_text),
        }

    def analyze_facility_distribution(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise how facilities are spread over regions and types.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry resolves to ``None`` are skipped.

        Returns:
            A dict that may contain ``"geographic_distribution"`` and
            ``"geographic_inequality"`` (Gini coefficient of the regional
            counts), ``"facility_type_distribution"`` and
            ``"facility_diversity"`` (Shannon index of the type counts), and
            ``"urban_rural_distribution"``. A key is present only when a
            matching column was found.
        """
        results = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Geographic distribution
            geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
            if geo_col is not None:
                geo_dist = df[geo_col].value_counts().to_dict()
                results["geographic_distribution"] = geo_dist
                # Gini coefficient of the per-region counts as an inequality measure
                results["geographic_inequality"] = self._calculate_gini(list(geo_dist.values()))

            # Facility type distribution
            type_col = self._find_column(df, ['type', 'category', 'facility_type'])
            if type_col is not None:
                type_dist = df[type_col].value_counts().to_dict()
                results["facility_type_distribution"] = type_dist
                results["facility_diversity"] = self._calculate_diversity_index(type_dist)

            # Urban vs rural distribution
            urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
            if urban_col is not None:
                results["urban_rural_distribution"] = df[urban_col].value_counts().to_dict()

        return results

    def analyze_capacity(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise capacity, utilisation and capacity growth.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry resolves to ``None`` are skipped.

        Returns:
            A dict that may contain ``"total_capacity"``,
            ``"capacity_by_type"``, ``"average_utilization"``,
            ``"utilization_by_type"``, ``"capacity_trends"`` (column label to
            column sum for every year column) and ``"capacity_growth_rate"``
            (percent change from the first to the last year column, in
            DataFrame column order). The growth rate is omitted when the
            first year's total is zero, because the percentage is undefined.
        """
        results = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Looked up once per dataset so that both the capacity and the
            # utilisation breakdowns can use it, whichever columns exist.
            type_col = self._find_column(df, ['type', 'facility_type'])

            # Current capacity
            capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
            if capacity_col is not None:
                results["total_capacity"] = df[capacity_col].sum()
                if type_col is not None:
                    results["capacity_by_type"] = (
                        df.groupby(type_col)[capacity_col].sum().to_dict()
                    )

            # Capacity utilization
            utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
            if utilization_col is not None:
                results["average_utilization"] = df[utilization_col].mean()
                if type_col is not None:
                    results["utilization_by_type"] = (
                        df.groupby(type_col)[utilization_col].mean().to_dict()
                    )

            # Capacity trends
            time_cols = self._find_year_columns(df)
            if len(time_cols) >= 2:
                trend_data = {col: df[col].sum() for col in time_cols}
                results["capacity_trends"] = trend_data

                baseline = trend_data[time_cols[0]]
                latest = trend_data[time_cols[-1]]
                if baseline != 0:
                    results["capacity_growth_rate"] = (latest - baseline) / baseline * 100

        return results

    def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Analyze resource allocation patterns.

        The registry's DataFrames are read only; no column is added to them.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry resolves to ``None`` are skipped.

        Returns:
            A dict that may contain ``"total_staff"``,
            ``"staff_per_bed_ratio"`` (mean of the per-row staff/capacity
            ratios, ignoring rows where the ratio is not finite, e.g. zero
            capacity; omitted when no row has a finite ratio) and
            ``"equipment_summary"`` (column label to column sum for every
            column whose label contains ``"equipment"``).
        """
        results = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Staff analysis
            staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
            if staff_col is not None:
                results["total_staff"] = df[staff_col].sum()

                # Staff per bed ratio
                capacity_col = self._find_column(df, ['capacity', 'beds'])
                if capacity_col is not None:
                    ratios = df[staff_col] / df[capacity_col]
                    # Division by zero capacity gives inf (or NaN for 0/0);
                    # one such row would otherwise poison the mean.
                    finite_ratios = ratios[np.isfinite(ratios)]
                    if not finite_ratios.empty:
                        results["staff_per_bed_ratio"] = finite_ratios.mean()

            # Equipment analysis
            equipment_cols = [col for col in df.columns if 'equipment' in str(col).lower()]
            if equipment_cols:
                results["equipment_summary"] = {col: df[col].sum() for col in equipment_cols}

        return results

    def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Compute changes between consecutive year columns.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry resolves to ``None`` are skipped.

        Returns:
            A dict that contains ``"year_over_year_trends"`` when a dataset
            has at least two year columns. Each entry is keyed
            ``"<prev>_to_<curr>"`` (adjacent columns in DataFrame order) and
            holds ``"absolute_change"`` and ``"percentage_change"``. Pairs
            whose earlier total is not positive are left out.
        """
        results = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            time_cols = self._find_year_columns(df)
            if len(time_cols) < 2:
                continue

            trends = {}
            for prev_year, curr_year in zip(time_cols, time_cols[1:]):
                prev_total = df[prev_year].sum()
                curr_total = df[curr_year].sum()
                # A non-positive base makes the percentage meaningless.
                if prev_total > 0:
                    change = curr_total - prev_total
                    trends[f"{prev_year}_to_{curr_year}"] = {
                        "absolute_change": change,
                        "percentage_change": change / prev_total * 100,
                    }
            results["year_over_year_trends"] = trends

        return results

    def generate_recommendations(self, analysis_results: Dict[str, Any],
                                 requirements: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate data-driven operational recommendations.

        Args:
            analysis_results: Output of the ``analyze_*`` methods keyed by
                task name, as assembled by :meth:`comprehensive_analysis`.
            requirements: Extracted scenario requirements. Accepted for
                interface compatibility; the current rules do not read it.

        Returns:
            A list of dicts with ``"title"``, ``"description"``,
            ``"priority"`` and ``"data_source"``, ordered High, Medium, Low.
            Entries of equal priority keep the order in which the rules ran.
        """
        recommendations = []

        # Capacity-related recommendations
        if "capacity_analysis" in analysis_results:
            capacity = analysis_results["capacity_analysis"]

            # The 0.7 threshold and the ``%`` format both treat utilisation
            # as a fraction of 1; NOTE(review): confirm datasets do not store
            # it as 0-100.
            if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
                recommendations.append({
                    "title": "Optimize Underutilized Capacity",
                    "description": f"Average utilization is {capacity['average_utilization']:.1%}. Consider repurposing underutilized facilities or consolidating services.",
                    "priority": "Medium",
                    "data_source": "Capacity utilization analysis"
                })

            # Growth rate is a percentage (see analyze_capacity).
            if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
                recommendations.append({
                    "title": "Expand Capacity Strategically",
                    "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}%. Invest in new facilities or expand existing ones to meet demand.",
                    "priority": "High",
                    "data_source": "Capacity trend analysis"
                })

        # Geographic distribution recommendations
        if "facility_distribution" in analysis_results:
            dist = analysis_results["facility_distribution"]
            if "geographic_inequality" in dist and dist["geographic_inequality"] > 0.4:
                recommendations.append({
                    "title": "Address Geographic Inequity",
                    "description": f"High geographic inequality (Gini: {dist['geographic_inequality']:.2f}). Consider targeted investments in underserved areas.",
                    "priority": "High",
                    "data_source": "Geographic distribution analysis"
                })

        # Resource allocation recommendations
        if "resource_allocation" in analysis_results:
            resources = analysis_results["resource_allocation"]
            if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
                recommendations.append({
                    "title": "Increase Staffing Levels",
                    "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f}, which may be insufficient. Consider hiring additional staff.",
                    "priority": "High",
                    "data_source": "Resource allocation analysis"
                })

        # Sort by priority; list.sort is stable, so ties keep insertion order.
        priority_order = {"High": 0, "Medium": 1, "Low": 2}
        recommendations.sort(key=lambda rec: priority_order.get(rec["priority"], 3))

        return recommendations

    def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
        """Identify opportunities for AI integration and data enhancement.

        The returned catalogue is fixed; ``analysis_results`` is accepted but
        not consulted. A new dict is built on every call.

        Returns:
            A dict with the keys ``"data_integration"``,
            ``"ai_applications"`` and ``"enhanced_metrics"``, each mapping to
            a list of descriptive dicts.
        """
        return {
            "data_integration": [
                {
                    "opportunity": "Integrate real-time occupancy data",
                    "description": "Combine current facility data with real-time occupancy monitoring systems",
                    "benefit": "Enable dynamic resource allocation and surge planning"
                },
                {
                    "opportunity": "Incorporate demographic data",
                    "description": "Add population demographics and health needs data",
                    "benefit": "Improve demand forecasting and service planning"
                },
            ],
            "ai_applications": [
                {
                    "opportunity": "Predictive capacity modeling",
                    "description": "Use ML to forecast capacity needs based on trends and external factors",
                    "benefit": "Proactive resource planning and reduced wait times"
                },
                {
                    "opportunity": "Optimization algorithms",
                    "description": "Implement AI for staff scheduling and resource allocation",
                    "benefit": "Improved efficiency and reduced operational costs"
                },
            ],
            "enhanced_metrics": [
                {
                    "metric": "Patient flow efficiency",
                    "description": "Measure time from admission to discharge across facilities",
                    "benefit": "Identify bottlenecks and improve patient experience"
                },
                {
                    "metric": "Resource utilization index",
                    "description": "Composite metric combining staff, equipment, and space utilization",
                    "benefit": "Holistic view of operational efficiency"
                },
            ],
        }

    # Helper methods

    def _find_column(self, df, patterns):
        """Return the first column whose label contains any pattern.

        Columns are scanned in DataFrame order and compared as lowercase
        text, so non-string labels are supported. Returns ``None`` when
        nothing matches.
        """
        lowered = [pattern.lower() for pattern in patterns]
        for col in df.columns:
            label = str(col).lower()
            if any(pattern in label for pattern in lowered):
                return col
        return None

    def _find_year_columns(self, df):
        """Return, in DataFrame order, the columns whose label contains a year token."""
        return [
            col for col in df.columns
            if any(year in str(col).lower() for year in self._YEAR_TOKENS)
        ]

    def _calculate_gini(self, values):
        """Calculate the Gini coefficient of a sequence of numbers.

        Returns ``0.0`` for an empty sequence or when the values sum to
        zero, where the coefficient would otherwise be a 0/0 division.
        """
        ordered = np.sort(np.asarray(values, dtype=float))
        n = ordered.size
        total = ordered.sum()
        if n == 0 or total == 0:
            return 0.0
        ranks = np.arange(1, n + 1)
        return float(np.sum((2 * ranks - n - 1) * ordered) / (n * total))

    def _calculate_diversity_index(self, distribution):
        """Calculate the Shannon diversity index (natural log) of a count mapping.

        Returns ``0`` when the counts sum to zero.
        """
        total = sum(distribution.values())
        if total == 0:
            return 0
        proportions = [count / total for count in distribution.values()]
        return -sum(p * np.log(p) for p in proportions if p > 0)

    def _extract_geographic_scope(self, text):
        """Return "Alberta" or "Canada" if mentioned (Alberta wins), else "Unknown"."""
        # Simple keyword-based extraction
        lowered = text.lower()
        if "alberta" in lowered:
            return "Alberta"
        if "canada" in lowered:
            return "Canada"
        return "Unknown"

    def _extract_time_period(self, text):
        """Return "<min>-<max>" over the 20xx years in the text.

        At least two year mentions are required; otherwise "Unknown" is
        returned.
        """
        years = re.findall(r'\b(20\d{2})\b', text)
        if len(years) >= 2:
            # Four-digit strings order the same lexically and numerically.
            return f"{min(years)}-{max(years)}"
        return "Unknown"

    def _extract_facility_types(self, text):
        """Return the facility type labels whose keywords occur in the text."""
        lowered = text.lower()
        types = []
        if "hospital" in lowered:
            types.append("Hospitals")
        if "nursing" in lowered or "long-term" in lowered:
            types.append("Nursing homes")
        if "clinic" in lowered:
            types.append("Clinics")
        return types

    def _extract_metrics(self, text):
        """Return the metric labels whose keywords occur in the text."""
        lowered = text.lower()
        metrics = []
        if "bed" in lowered:
            metrics.append("Bed capacity")
        if "occupancy" in lowered:
            metrics.append("Occupancy rates")
        if "staff" in lowered:
            metrics.append("Staffing levels")
        return metrics

    def _identify_relevant_data(self, text):
        """Return the names of the datasets the registry relates to healthcare keywords.

        The keyword list is fixed; ``text`` is accepted but not used to
        narrow the search.
        """
        # Use data registry's find_related_datasets method
        keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
        return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]