Rajan Sharma committed
Commit 1134cbf · verified · 1 Parent(s): 33bf7ab

Update healthcare_analysis.py

Files changed (1)
  1. healthcare_analysis.py +8 -927
healthcare_analysis.py CHANGED
@@ -1,932 +1,13 @@
 # healthcare_analysis.py
 import pandas as pd
-import numpy as np
-from typing import Dict, List, Any, Optional, Tuple
-import logging
-import re
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from data_registry import DataRegistry
 
 class HealthcareAnalyzer:
-    def __init__(self, data_registry):
-        self.data_registry = data_registry
-        self.analysis_results = {}
-        self.scenario_text = ""
-
-    def comprehensive_analysis(self, scenario_text: str) -> Dict[str, Any]:
-        """Perform comprehensive healthcare scenario analysis"""
-        logger.info("Starting comprehensive healthcare analysis")
-
-        self.scenario_text = scenario_text
-
-        # Extract all requirements and tasks
-        requirements = self._extract_all_requirements(scenario_text)
-        tasks = self._extract_detailed_tasks(scenario_text)
-
-        # Identify relevant datasets
-        relevant_data = self._identify_relevant_data(scenario_text)
-
-        # Perform all analyses based on tasks
-        results = {
-            "requirements": requirements,
-            "tasks_completed": [],
-            "data_sources": relevant_data
-        }
-
-        # Data Preparation Tasks
-        if "data_preparation" in tasks:
-            results["data_preparation"] = self.analyze_data_preparation(relevant_data, requirements)
-            results["tasks_completed"].append("data_preparation")
-
-        # Facility Distribution Analysis
-        if "facility_distribution" in tasks:
-            results["facility_distribution"] = self.analyze_facility_distribution(relevant_data, requirements)
-            results["tasks_completed"].append("facility_distribution")
-
-        # Capacity Analysis
-        if "capacity_analysis" in tasks:
-            results["capacity_analysis"] = self.analyze_capacity(relevant_data, requirements)
-            results["tasks_completed"].append("capacity_analysis")
-
-        # Long-Term Care Assessment (specific to scenario requirements)
-        if "long_term_care_assessment" in tasks:
-            results["long_term_care_assessment"] = self.analyze_long_term_care_capacity(results, requirements)
-            results["tasks_completed"].append("long_term_care_assessment")
-
-        # Resource Allocation Analysis
-        if "resource_allocation" in tasks:
-            results["resource_allocation"] = self.analyze_resource_allocation(relevant_data)
-            results["tasks_completed"].append("resource_allocation")
-
-        # Trends Analysis
-        if "trends" in tasks:
-            results["trends"] = self.analyze_trends(relevant_data)
-            results["tasks_completed"].append("trends")
-
-        # Generate recommendations
-        if "operational_recommendations" in tasks:
-            results["recommendations"] = self.generate_operational_recommendations(results, requirements)
-            results["tasks_completed"].append("operational_recommendations")
-
-        # Future Integration Opportunities
-        if "future_integration" in tasks:
-            results["future_integration"] = self.identify_integration_opportunities(results)
-            results["tasks_completed"].append("future_integration")
-
-        # Validate that all required tasks were completed
-        validation_result = self.validate_analysis_completeness(tasks, results["tasks_completed"])
-        results["validation"] = validation_result
-
-        logger.info("Comprehensive analysis completed")
-        return results
-
-    def _extract_all_requirements(self, scenario_text: str) -> Dict[str, Any]:
-        """Extract all specific requirements from scenario text"""
-        requirements = {
-            "geographic_scope": self._extract_geographic_scope(scenario_text),
-            "time_period": self._extract_time_period(scenario_text),
-            "facility_types": self._extract_facility_types(scenario_text),
-            "metrics_needed": self._extract_metrics(scenario_text),
-            "regions": self._extract_regions(scenario_text),
-            "data_files": self._extract_data_files(scenario_text),
-            "specific_questions": self._extract_specific_questions(scenario_text)
-        }
-        return requirements
-
-    def _extract_detailed_tasks(self, scenario_text: str) -> List[str]:
-        """Extract detailed tasks from scenario text"""
-        tasks = []
-        text_lower = scenario_text.lower()
-
-        # Data preparation tasks
-        if any(phrase in text_lower for phrase in ["load the data", "data preparation", "frequency table"]):
-            tasks.append("data_preparation")
-
-        # Facility distribution tasks
-        if any(phrase in text_lower for phrase in ["facility distribution", "cities with highest", "facility type"]):
-            tasks.append("facility_distribution")
-
-        # Capacity analysis tasks
-        if any(phrase in text_lower for phrase in ["bed capacity", "capacity analysis", "bed_change"]):
-            tasks.append("capacity_analysis")
-
-        # Long-term care assessment tasks
-        if any(phrase in text_lower for phrase in ["long-term care", "long term care", "nursing care"]):
-            tasks.append("long_term_care_assessment")
-
-        # Resource allocation tasks
-        if any(phrase in text_lower for phrase in ["resource allocation", "staffing", "equipment"]):
-            tasks.append("resource_allocation")
-
-        # Trends analysis tasks
-        if any(phrase in text_lower for phrase in ["trends", "change", "growth", "decline"]):
-            tasks.append("trends")
-
-        # Operational recommendations tasks
-        if any(phrase in text_lower for phrase in ["operational recommendations", "recommend actions", "mitigate shortages"]):
-            tasks.append("operational_recommendations")
-
-        # Future integration tasks
-        if any(phrase in text_lower for phrase in ["future integration", "augmented ai", "decision-making"]):
-            tasks.append("future_integration")
-
-        return tasks
-
-    def _extract_specific_questions(self, scenario_text: str) -> List[str]:
-        """Extract specific questions from scenario text"""
-        questions = []
-
-        # Look for question patterns
-        question_patterns = [
-            r'which zone shows the largest',
-            r'which zone has the largest',
-            r'list the five',
-            r'does this city have',
-            r'provide the numbers to justify',
-            r'propose at least',
-            r'mention at least'
-        ]
-
-        for pattern in question_patterns:
-            matches = re.findall(pattern, scenario_text, re.IGNORECASE)
-            questions.extend(matches)
-
-        return questions
-
-    def _extract_data_files(self, scenario_text: str) -> List[str]:
-        """Extract data file names from scenario text"""
-        files = []
-
-        # Look for file patterns
-        file_patterns = [
-            r'([a-zA-Z_]+\.csv)',
-            r'([a-zA-Z_]+\.xlsx)',
-            r'([a-zA-Z_]+\.json)'
-        ]
-
-        for pattern in file_patterns:
-            matches = re.findall(pattern, scenario_text)
-            files.extend(matches)
-
-        return list(set(files))  # Remove duplicates
-
-    def analyze_data_preparation(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced data preparation analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Facility type frequency table
-            type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
-            if type_col:
-                filtered_df[type_col] = filtered_df[type_col].astype(str)
-                type_freq = filtered_df[type_col].value_counts().to_dict()
-                results["facility_type_frequency"] = type_freq
-
-            # Top cities analysis
-            city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
-            if city_col:
-                filtered_df[city_col] = filtered_df[city_col].astype(str)
-                city_counts = filtered_df[city_col].value_counts().head(5)
-                top_cities = city_counts.index.tolist()
-
-                # Breakdown by facility type for each top city
-                city_breakdown = {}
-                for city in top_cities:
-                    city_data = filtered_df[filtered_df[city_col] == city]
-                    if not city_data.empty and type_col in city_data.columns:
-                        city_breakdown[city] = city_data[type_col].value_counts().to_dict()
-
-                results["top_cities"] = top_cities
-                results["city_facility_breakdown"] = city_breakdown
-
-            # Total facilities count
-            results["total_facilities"] = len(filtered_df)
-
-        return results
-
-    def analyze_long_term_care_capacity(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Analyze long-term care capacity based on scenario requirements"""
-        results = {}
-
-        # Get the zone with the largest percentage decrease from capacity analysis
-        if "capacity_analysis" in analysis_results:
-            capacity_data = analysis_results["capacity_analysis"]
-
-            # Find the zone with largest percentage decrease
-            max_pct_decrease = capacity_data.get("max_percentage_decrease", {})
-
-            # Extract zone name (try multiple possible keys)
-            zone_name = None
-            for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
-                if key in max_pct_decrease:
-                    zone_name = max_pct_decrease[key]
-                    break
-
-            if zone_name:
-                results["zone_with_largest_decrease"] = zone_name
-
-                # Get facility distribution data
-                if "facility_distribution" in analysis_results:
-                    facility_data = analysis_results["facility_distribution"]
-
-                    # Find the major city in this zone
-                    major_city = self._find_major_city_in_zone(zone_name, facility_data, requirements)
-
-                    if major_city:
-                        results["major_city"] = major_city
-
-                        # Analyze long-term care capacity in this city
-                        city_breakdown = facility_data.get("city_facility_breakdown", {})
-
-                        if major_city in city_breakdown:
-                            facilities_in_city = city_breakdown[major_city]
-
-                            # Count different facility types
-                            hospitals = facilities_in_city.get("Hospitals", 0)
-                            nursing_care = facilities_in_city.get("Nursing and residential care facilities", 0)
-                            ambulatory = facilities_in_city.get("Ambulatory health care services", 0)
-
-                            results["facility_counts"] = {
-                                "hospitals": hospitals,
-                                "nursing_residential_care": nursing_care,
-                                "ambulatory": ambulatory
-                            }
-
-                            # Calculate ratio and assess sufficiency
-                            if hospitals > 0:
-                                ratio = nursing_care / hospitals
-                                results["nursing_to_hospital_ratio"] = ratio
-
-                                # Assess capacity
-                                if ratio >= 1.5:
-                                    results["capacity_assessment"] = "sufficient"
-                                else:
-                                    results["capacity_assessment"] = "insufficient"
-                            else:
-                                results["capacity_assessment"] = "insufficient (no hospitals)"
-
-        return results
-
-    def _find_major_city_in_zone(self, zone_name: str, facility_data: Dict[str, Any], requirements: Dict[str, Any]) -> Optional[str]:
-        """Find the major city in a given zone"""
-        # This is a simplified approach - in a real implementation, you would need
-        # zone-to-city mapping data or more sophisticated geospatial analysis
-
-        # For now, we'll use the city with the most facilities as the major city
-        top_cities = facility_data.get("top_cities", [])
-
-        if top_cities:
-            # In a real implementation, you would check which city belongs to the zone
-            # For now, we'll return the first city as a placeholder
-            return top_cities[0]
-
-        return None
-
-    def generate_operational_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
-        """Generate comprehensive operational recommendations"""
-        recommendations = []
-        geographic_scope = requirements.get("geographic_scope", "the region")
-
-        # Capacity-related recommendations
-        if "capacity_analysis" in analysis_results:
-            capacity = analysis_results["capacity_analysis"]
-
-            # Low utilization recommendations
-            if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
-                recommendations.append({
-                    "title": "Optimize Underutilized Capacity",
-                    "description": f"Average utilization is {capacity['average_utilization']:.1%} in {geographic_scope}. Consider repurposing underutilized facilities or consolidating services.",
-                    "priority": "Medium",
-                    "data_source": "Capacity utilization analysis"
-                })
-
-            # Capacity growth recommendations
-            if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
-                recommendations.append({
-                    "title": "Expand Capacity Strategically",
-                    "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}% in {geographic_scope}. Invest in new facilities or expand existing ones to meet demand.",
-                    "priority": "High",
-                    "data_source": "Capacity trend analysis"
-                })
-
-            # Zone-specific recommendations
-            if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
-                zone_name = "a zone"
-                for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
-                    if key in capacity["max_percentage_decrease"]:
-                        zone_name = capacity["max_percentage_decrease"][key]
-                        break
-
-                decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
-
-                if zone_name and decrease:
-                    recommendations.append({
-                        "title": f"Address Capacity Decline in {zone_name}",
-                        "description": f"{zone_name} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
-                        "priority": "High",
-                        "data_source": "Zone capacity analysis"
-                    })
-
-        # Long-term care recommendations
-        if "long_term_care_assessment" in analysis_results:
-            ltc_data = analysis_results["long_term_care_assessment"]
-
-            if ltc_data.get("capacity_assessment") == "insufficient":
-                major_city = ltc_data.get("major_city", "the major city")
-                ratio = ltc_data.get("nursing_to_hospital_ratio", 0)
-
-                recommendations.append({
-                    "title": f"Expand Long-Term Care Capacity in {major_city}",
-                    "description": f"Nursing/residential care to hospital ratio is {ratio:.2f} in {major_city}, which is insufficient. Invest in new long-term care beds or repurpose existing facilities.",
-                    "priority": "High",
-                    "data_source": "Long-term care capacity assessment"
-                })
-
-        # Resource allocation recommendations
-        if "resource_allocation" in analysis_results:
-            resources = analysis_results["resource_allocation"]
-
-            if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
-                recommendations.append({
-                    "title": "Increase Staffing Levels",
-                    "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f} in {geographic_scope}, which may be insufficient. Consider hiring additional staff.",
-                    "priority": "High",
-                    "data_source": "Resource allocation analysis"
-                })
-
-        # Ensure we have at least 3 recommendations as required
-        while len(recommendations) < 3:
-            recommendations.append({
-                "title": "Implement Comprehensive Capacity Management",
-                "description": "Develop a comprehensive capacity management system that includes real-time monitoring, predictive analytics, and dynamic resource allocation.",
-                "priority": "Medium",
-                "data_source": "General best practices"
-            })
-
-        # Sort by priority
-        priority_order = {"High": 0, "Medium": 1, "Low": 2}
-        recommendations.sort(key=lambda x: priority_order.get(x["priority"], 3))
-
-        return recommendations
-
-    def validate_analysis_completeness(self, required_tasks: List[str], completed_tasks: List[str]) -> Dict[str, Any]:
-        """Validate that all required tasks were completed"""
-        # Count only required tasks toward the rate; an empty requirement list is vacuously complete
-        completed_required = [task for task in required_tasks if task in completed_tasks]
-        validation = {
-            "all_tasks_completed": True,
-            "missing_tasks": [],
-            "completion_rate": len(completed_required) / len(required_tasks) if required_tasks else 1.0
-        }
-
-        for task in required_tasks:
-            if task not in completed_tasks:
-                validation["all_tasks_completed"] = False
-                validation["missing_tasks"].append(task)
-
-        return validation
-
-    def analyze_facility_distribution(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced facility distribution analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Facility type distribution
-            type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
-            if type_col:
-                # Ensure we're working with string data
-                filtered_df[type_col] = filtered_df[type_col].astype(str)
-                type_dist = filtered_df[type_col].value_counts().to_dict()
-                results["facility_type_distribution"] = type_dist
-
-                # Calculate diversity index
-                diversity = self._calculate_diversity_index(type_dist)
-                results["facility_diversity"] = diversity
-
-            # Geographic distribution
-            geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
-            if geo_col:
-                # Ensure we're working with string data
-                filtered_df[geo_col] = filtered_df[geo_col].astype(str)
-                geo_dist = filtered_df[geo_col].value_counts().to_dict()
-                results["geographic_distribution"] = geo_dist
-
-                # Calculate Gini coefficient for inequality
-                gini = self._calculate_gini(list(geo_dist.values()))
-                results["geographic_inequality"] = gini
-
-            # City distribution
-            city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
-            if city_col:
-                # Ensure we're working with string data
-                filtered_df[city_col] = filtered_df[city_col].astype(str)
-                city_counts = filtered_df[city_col].value_counts().head(5)
-                top_cities = city_counts.index.tolist()
-
-                # Breakdown by facility type for top cities
-                city_breakdown = {}
-                for city in top_cities:
-                    city_data = filtered_df[filtered_df[city_col] == city]
-                    if not city_data.empty and type_col in city_data.columns:
-                        city_breakdown[city] = city_data[type_col].value_counts().to_dict()
-
-                results["top_cities"] = top_cities
-                results["city_breakdown"] = city_breakdown
-
-            # Total facilities count
-            results["total_facilities"] = len(filtered_df)
-
-        return results
-
-    def analyze_capacity(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
-        """Enhanced capacity analysis"""
-        results = {}
-        geographic_scope = requirements.get("geographic_scope", "Unknown")
-        regions = requirements.get("regions", [])
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-
-            # Filter data based on geographic scope
-            filtered_df = self._filter_by_geography(df, geographic_scope, regions)
-
-            if filtered_df.empty:
-                continue
-
-            # Resolve the facility-type column once so both the capacity and
-            # utilization breakdowns below can use it without a NameError
-            type_col = self._find_column(filtered_df, ['type', 'facility_type'])
-
-            # Current capacity
-            capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
-            if capacity_col:
-                # Ensure we're working with numeric data
-                filtered_df[capacity_col] = pd.to_numeric(filtered_df[capacity_col], errors='coerce')
-                total_capacity = filtered_df[capacity_col].sum()
-                results["total_capacity"] = total_capacity
-
-                # Capacity by facility type
-                if type_col and type_col in filtered_df.columns:
-                    capacity_by_type = filtered_df.groupby(type_col)[capacity_col].sum().to_dict()
-                    results["capacity_by_type"] = capacity_by_type
-
-            # Capacity utilization
-            utilization_col = self._find_column(filtered_df, ['utilization', 'occupancy', 'occupancy_rate'])
-            if utilization_col:
-                # Ensure we're working with numeric data
-                filtered_df[utilization_col] = pd.to_numeric(filtered_df[utilization_col], errors='coerce')
-                avg_utilization = filtered_df[utilization_col].mean()
-                results["average_utilization"] = avg_utilization
-
-                # Utilization by facility type
-                if type_col and type_col in filtered_df.columns:
-                    utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
-                    results["utilization_by_type"] = utilization_by_type
-
-            # Capacity trends (sort year columns so earliest/latest are chronological)
-            time_cols = sorted(col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024']))
-            if len(time_cols) >= 2:
-                trend_data = {}
-                for col in time_cols:
-                    # Ensure we're working with numeric data
-                    filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')
-                    trend_data[col] = filtered_df[col].sum()
-                results["capacity_trends"] = trend_data
-
-                # Calculate growth rate
-                latest = time_cols[-1]
-                earliest = time_cols[0]
-                if trend_data[earliest] > 0:  # Avoid division by zero
-                    growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
-                    results["capacity_growth_rate"] = growth_rate
-
-            # Bed change analysis
-            prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
-            current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
-
-            if prev_col and current_col:
-                # Ensure we're working with numeric data
-                filtered_df[prev_col] = pd.to_numeric(filtered_df[prev_col], errors='coerce')
-                filtered_df[current_col] = pd.to_numeric(filtered_df[current_col], errors='coerce')
-
-                # Calculate bed change
-                filtered_df['bed_change'] = filtered_df[current_col] - filtered_df[prev_col]
-
-                # Calculate percentage change
-                filtered_df['percent_change'] = filtered_df.apply(
-                    lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
-                    axis=1
-                )
-
-                # Zone/Region-level analysis
-                zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
-                if zone_col:
-                    # Ensure we're working with string data
-                    filtered_df[zone_col] = filtered_df[zone_col].astype(str)
-
-                    zone_summary = filtered_df.groupby(zone_col).agg({
-                        current_col: 'sum',
-                        prev_col: 'sum',
-                        'bed_change': 'sum'
-                    }).reset_index()
-
-                    zone_summary['percent_change'] = zone_summary.apply(
-                        lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
-                        axis=1
-                    )
-
-                    results["zone_summary"] = zone_summary.to_dict('records')
-
-                    # Find zones with largest changes
-                    if not zone_summary.empty:
-                        # Get zone with largest absolute decrease
-                        if zone_summary['bed_change'].notna().any():
-                            max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
-                            max_abs_decrease = zone_summary.loc[max_abs_decrease_idx]
-                            results["max_absolute_decrease"] = max_abs_decrease.to_dict()
-
-                        # Get zone with largest percentage decrease
-                        if zone_summary['percent_change'].notna().any():
-                            max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
-                            max_pct_decrease = zone_summary.loc[max_pct_decrease_idx]
-                            results["max_percentage_decrease"] = max_pct_decrease.to_dict()
-
-                # Identify facilities with largest declines
-                facilities_decline = filtered_df.sort_values('bed_change').head(5)
-                if not facilities_decline.empty:
-                    results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')
-
-        return results
-
-    def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
-        """Filter dataframe based on geographic scope and regions"""
-        if geographic_scope == "Unknown" and not regions:
-            return df.copy()
-
-        # Try to find a geographic column
-        geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
-
-        if geo_col is None:
-            return df.copy()
-
-        # Work on a copy so the registry's original dataframe is not mutated
-        df = df.copy()
-
-        # Ensure we're working with string data
-        try:
-            df[geo_col] = df[geo_col].astype(str)
-        except Exception as e:
-            logger.warning(f"Error converting column {geo_col} to string: {str(e)}")
-            return df
-
-        # Create filters
-        filters = []
-
-        # Add geographic scope filter
-        if geographic_scope != "Unknown":
-            # Create a list of possible values for the geographic scope
-            scope_values = [geographic_scope.lower()]
-
-            # Add common abbreviations
-            abbreviations = {
-                # Canadian provinces
-                "alberta": "ab", "british columbia": "bc", "ontario": "on", "quebec": "qc",
-                "manitoba": "mb", "saskatchewan": "sk", "nova scotia": "ns", "new brunswick": "nb",
-                "prince edward island": "pe", "newfoundland": "nl", "yukon": "yt",
-                "northwest territories": "nt", "nunavut": "nu",
-                # US states
-                "alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
-                "california": "ca", "colorado": "co", "connecticut": "ct", "delaware": "de",
-                "florida": "fl", "georgia": "ga", "hawaii": "hi", "idaho": "id",
-                "illinois": "il", "indiana": "in", "iowa": "ia", "kansas": "ks",
-                "kentucky": "ky", "louisiana": "la", "maine": "me", "maryland": "md",
-                "massachusetts": "ma", "michigan": "mi", "minnesota": "mn", "mississippi": "ms",
-                "missouri": "mo", "montana": "mt", "nebraska": "ne", "nevada": "nv",
-                "new hampshire": "nh", "new jersey": "nj", "new mexico": "nm", "new york": "ny",
-                "north carolina": "nc", "north dakota": "nd", "ohio": "oh", "oklahoma": "ok",
-                "oregon": "or", "pennsylvania": "pa", "rhode island": "ri", "south carolina": "sc",
-                "south dakota": "sd", "tennessee": "tn", "texas": "tx", "utah": "ut",
-                "vermont": "vt", "virginia": "va", "washington": "wa", "west virginia": "wv",
-                "wisconsin": "wi", "wyoming": "wy"
-            }
-
-            if geographic_scope.lower() in abbreviations:
-                scope_values.append(abbreviations[geographic_scope.lower()])
-
-            try:
-                scope_filter = df[geo_col].str.lower().isin(scope_values)
-                filters.append(scope_filter)
-            except Exception as e:
-                logger.warning(f"Error creating scope filter: {str(e)}")
-
-        # Add region filters
-        if regions:
-            try:
-                region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
-                filters.append(region_filter)
-            except Exception as e:
-                logger.warning(f"Error creating region filter: {str(e)}")
-
-        # Apply filters (a row passes if it matches any filter)
-        if filters:
-            try:
-                combined_filter = filters[0]
-                for f in filters[1:]:
-                    combined_filter = combined_filter | f
-                return df[combined_filter].copy()
-            except Exception as e:
-                logger.warning(f"Error applying filters: {str(e)}")
-
-        return df.copy()
-
-    def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze resource allocation patterns"""
-        results = {}
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-            # Work on a copy so derived columns don't leak into the registry's data
-            df = df.copy()
-
-            # Staff analysis
-            staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
-            if staff_col:
-                # Ensure we're working with numeric data
-                df[staff_col] = pd.to_numeric(df[staff_col], errors='coerce')
-                total_staff = df[staff_col].sum()
-                results["total_staff"] = total_staff
-
-                # Staff per bed ratio
-                capacity_col = self._find_column(df, ['capacity', 'beds'])
-                if capacity_col and capacity_col in df.columns:
-                    # Ensure we're working with numeric data
-                    df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
-                    df['staff_per_bed'] = df[staff_col] / df[capacity_col].replace(0, np.nan)  # Avoid division by zero
-                    avg_staff_per_bed = df['staff_per_bed'].mean()
-                    results["staff_per_bed_ratio"] = avg_staff_per_bed
-
-            # Equipment analysis
-            equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
-            if equipment_cols:
-                equipment_summary = {}
-                for col in equipment_cols:
-                    # Ensure we're working with numeric data
-                    df[col] = pd.to_numeric(df[col], errors='coerce')
-                    equipment_summary[col] = df[col].sum()
-                results["equipment_summary"] = equipment_summary
-
-        return results
-
-    def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze trends in healthcare data"""
-        results = {}
-
-        for data_name in relevant_data:
-            df = self.data_registry.get(data_name)
-            if df is None or df.empty:
-                continue
-            # Work on a copy so type coercion doesn't leak into the registry's data
-            df = df.copy()
-
-            # Find time-based columns (sorted so year-over-year pairs are chronological)
-            time_cols = sorted(col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024']))
-
-            if len(time_cols) >= 2:
-                trends = {}
-
-                # Calculate year-over-year changes
-                for i in range(1, len(time_cols)):
-                    prev_year = time_cols[i-1]
-                    curr_year = time_cols[i]
-
-                    # Ensure we're working with numeric data
-                    df[prev_year] = pd.to_numeric(df[prev_year], errors='coerce')
-                    df[curr_year] = pd.to_numeric(df[curr_year], errors='coerce')
-
-                    prev_total = df[prev_year].sum()
-                    curr_total = df[curr_year].sum()
-
-                    if prev_total > 0:  # Avoid division by zero
-                        change_pct = (curr_total - prev_total) / prev_total * 100
-                        trends[f"{prev_year}_to_{curr_year}"] = {
-                            "absolute_change": curr_total - prev_total,
-                            "percentage_change": change_pct
-                        }
-
-                results["year_over_year_trends"] = trends
-
+    def __init__(self, registry: DataRegistry):
+        self.registry = registry
+
+    def comprehensive_analysis(self, scenario: str) -> dict:
+        results = {}
+        for name in self.registry.names():
+            results[name] = self.registry.get(name)
         return results
-
-    def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
-        """Identify opportunities for AI integration and data enhancement"""
-        opportunities = {
-            "data_integration": [],
-            "ai_applications": [],
-            "enhanced_metrics": []
-        }
-
-        # Data integration opportunities
-        opportunities["data_integration"].append({
-            "opportunity": "Integrate real-time occupancy data",
-            "description": "Combine current facility data with real-time occupancy monitoring systems",
-            "benefit": "Enable dynamic resource allocation and surge planning"
-        })
-
-        opportunities["data_integration"].append({
-            "opportunity": "Incorporate demographic data",
-            "description": "Add population demographics and health needs data",
-            "benefit": "Improve demand forecasting and service planning"
-        })
-
-        # AI application opportunities
-        opportunities["ai_applications"].append({
-            "opportunity": "Predictive capacity modeling",
-            "description": "Use ML to forecast capacity needs based on trends and external factors",
-            "benefit": "Proactive resource planning and reduced wait times"
-        })
-
-        opportunities["ai_applications"].append({
-            "opportunity": "Optimization algorithms",
-            "description": "Implement AI for staff scheduling and resource allocation",
-            "benefit": "Improved efficiency and reduced operational costs"
-        })
-
-        # Enhanced metrics
-        opportunities["enhanced_metrics"].append({
-            "metric": "Patient flow efficiency",
-            "description": "Measure time from admission to discharge across facilities",
-            "benefit": "Identify bottlenecks and improve patient experience"
-        })
-
-        opportunities["enhanced_metrics"].append({
-            "metric": "Resource utilization index",
-            "description": "Composite metric combining staff, equipment, and space utilization",
-            "benefit": "Holistic view of operational efficiency"
-        })
-
-        return opportunities
-
-    # Helper methods
-    def _find_column(self, df, patterns):
-        """Find the first column matching any pattern"""
-        if df is None or df.empty:
-            return None
-        for col in df.columns:
-            if any(pattern.lower() in col.lower() for pattern in patterns):
-                return col
-        return None
-
-    def _calculate_gini(self, values):
-        """Calculate Gini coefficient for inequality measurement"""
-        if not values or len(values) < 2:
-            return 0
-
-        values = np.asarray(sorted(values), dtype=float)
-        n = len(values)
-        index = np.arange(1, n + 1)
-        total = np.sum(values)
-
-        if total == 0:
-            return 0
-
-        gini = np.sum((2 * index - n - 1) * values) / (n * total)
-        return gini
-
-    def _calculate_diversity_index(self, distribution):
-        """Calculate Shannon diversity index"""
-        if not distribution:
-            return 0
-
-        total = sum(distribution.values())
-        if total == 0:
-            return 0
-
-        proportions = [count / total for count in distribution.values() if count > 0]
-        if not proportions:
-            return 0
-
-        return -sum(p * np.log(p) for p in proportions)
-
-    def _extract_geographic_scope(self, text):
-        """Extract geographic scope from text"""
-        # Look for province/state names
-        provinces = [
-            "alberta", "british columbia", "ontario", "quebec", "manitoba",
-            "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
-            "newfoundland", "yukon", "northwest territories", "nunavut"
-        ]
-
-        states = [
-            "alabama", "alaska", "arizona", "arkansas", "california", "colorado",
-            "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho",
-            "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
-            "maine", "maryland", "massachusetts", "michigan", "minnesota",
-            "mississippi", "missouri", "montana", "nebraska", "nevada",
-            "new hampshire", "new jersey", "new mexico", "new york",
-            "north carolina", "north dakota", "ohio", "oklahoma", "oregon",
-            "pennsylvania", "rhode island", "south carolina", "south dakota",
-            "tennessee", "texas", "utah", "vermont", "virginia", "washington",
-            "west virginia", "wisconsin", "wyoming"
-        ]
-
-        text_lower = text.lower()
-
-        # Check for provinces
-        for province in provinces:
-            if province in text_lower:
-                return province.title()
-
-        # Check for states
-        for state in states:
-            if state in text_lower:
-                return state.title()
-
-        # Check for countries
-        if "canada" in text_lower:
-            return "Canada"
-        if "usa" in text_lower or "united states" in text_lower:
-            return "United States"
-
-        return "Unknown"
-
-    def _extract_time_period(self, text):
-        """Extract time period from text"""
-        # Look for year patterns
-        years = re.findall(r'\b(20\d{2})\b', text)
-        if len(years) >= 2:
-            return f"{min(years)}-{max(years)}"
-        return "Unknown"
-
-    def _extract_facility_types(self, text):
-        """Extract facility types from text"""
-        types = []
-        if "hospital" in text.lower():
-            types.append("Hospitals")
-        if "nursing" in text.lower() or "long-term" in text.lower():
-            types.append("Nursing homes")
-        if "clinic" in text.lower():
-            types.append("Clinics")
-        return types
-
-    def _extract_metrics(self, text):
-        """Extract required metrics from text"""
-        metrics = []
-        if "bed" in text.lower():
-            metrics.append("Bed capacity")
-        if "occupancy" in text.lower():
-            metrics.append("Occupancy rates")
-        if "staff" in text.lower():
-            metrics.append("Staffing levels")
-        return metrics
-
-    def _extract_regions(self, text):
-        """Extract specific regions mentioned in the scenario"""
-        # Look for region names in the scenario
-        regions = []
-
-        # Common region patterns - this could be expanded
-        region_patterns = [
-            r'([A-Z][a-z]+ (Zone|Region|Area|District))',
-            r'(North|South|East|West|Central)',
-            r'([A-Z][a-z]+ (City|County|State|Province))',
-            r'([A-Z][a-z]+)'
-        ]
-
-        for pattern in region_patterns:
-            matches = re.findall(pattern, text)
-            for match in matches:
-                if isinstance(match, tuple):
-                    regions.append(match[0])
-                else:
-                    regions.append(match)
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_regions = [r for r in regions if not (r in seen or seen.add(r))]
-
-        return unique_regions
-
-    def _identify_relevant_data(self, text):
-        """Identify relevant datasets for the scenario"""
-        # Use data registry's find_related_datasets method
-        keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
-        return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]
 
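The removed inequality and diversity helpers are self-contained formulas, so they can be checked in isolation. A small worked verification of the same math (the test values are made up):

# standalone check of the removed Gini / Shannon helpers
import numpy as np

def gini(values):
    # Gini coefficient over sorted x: sum((2i - n - 1) * x_i) / (n * sum(x))
    if not values or len(values) < 2:
        return 0.0
    x = np.asarray(sorted(values), dtype=float)
    n, total = len(x), x.sum()
    if total == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return float(np.sum((2 * index - n - 1) * x) / (n * total))

def shannon(distribution):
    # Shannon diversity index: -sum(p * ln p) over nonzero proportions
    total = sum(distribution.values())
    if total == 0:
        return 0.0
    proportions = [count / total for count in distribution.values() if count > 0]
    return -sum(p * np.log(p) for p in proportions)

assert gini([1, 1, 1, 1]) == 0.0                      # perfect equality
assert abs(gini([0, 0, 0, 4]) - 0.75) < 1e-9          # one region holds everything
assert abs(shannon({"Hospitals": 5, "Clinics": 5}) - np.log(2)) < 1e-9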
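For context on what the commit leaves behind: the new 13-line class simply returns every registered dataset keyed by name, ignoring the scenario text. A minimal usage sketch, assuming data_registry.DataRegistry exposes names() and get(name) returning pandas DataFrames as the new code implies (the no-argument constructor and the scenario string are hypothetical):

# usage sketch for the slimmed-down HealthcareAnalyzer
from data_registry import DataRegistry
from healthcare_analysis import HealthcareAnalyzer

registry = DataRegistry()  # hypothetical: constructor not shown in the diff
analyzer = HealthcareAnalyzer(registry)

results = analyzer.comprehensive_analysis("Assess bed capacity in Alberta")
for name, df in results.items():
    print(name, df.shape)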