Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 23

Commit

340e032

verified ·

1 Parent(s): 6c051db

Update healthcare_analysis.py

Browse files

Files changed (1) hide show

healthcare_analysis.py +87 -62

healthcare_analysis.py CHANGED Viewed

@@ -82,11 +82,12 @@ class HealthcareAnalyzer:
         # Look for region names in the scenario
         regions = []
-        # Common region patterns - this could be expanded
         region_patterns = [
             r'([A-Z][a-z]+ (Zone|Region|Area|District))',
-            r'(North|South|East|West|Central|Calgary|Edmonton|Toronto|Vancouver|Montreal)',
-            r'(Alberta|British Columbia|Ontario|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|PEI|Newfoundland|Yukon|NWT|Nunavut)'
         ]
         import re
@@ -115,13 +116,13 @@ class HealthcareAnalyzer:
             if df is None or df.empty:
                 continue
-            # Filter data based on geographic scope
             filtered_df = self._filter_by_geography(df, geographic_scope, regions)
             if filtered_df.empty:
                 continue
-            # Facility type distribution
             type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
             if type_col:
                 # Ensure we're working with string data
@@ -133,7 +134,7 @@ class HealthcareAnalyzer:
                 diversity = self._calculate_diversity_index(type_dist)
                 results["facility_diversity"] = diversity
-            # Geographic distribution
             geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
             if geo_col:
                 # Ensure we're working with string data
@@ -145,7 +146,7 @@ class HealthcareAnalyzer:
                 gini = self._calculate_gini(list(geo_dist.values()))
                 results["geographic_inequality"] = gini
-            # City distribution
             city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
             if city_col:
                 # Ensure we're working with string data
@@ -179,13 +180,13 @@ class HealthcareAnalyzer:
             if df is None or df.empty:
                 continue
-            # Filter data based on geographic scope
             filtered_df = self._filter_by_geography(df, geographic_scope, regions)
             if filtered_df.empty:
                 continue
-            # Current capacity
             capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
             if capacity_col:
                 # Ensure we're working with numeric data
@@ -212,7 +213,7 @@ class HealthcareAnalyzer:
                         utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
                         results["utilization_by_type"] = utilization_by_type
-                # Capacity trends
                 time_cols = [col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
                 if len(time_cols) >= 2:
                     trend_data = {}
@@ -230,7 +231,7 @@ class HealthcareAnalyzer:
                             growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
                             results["capacity_growth_rate"] = growth_rate
-            # Bed change analysis
             prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
             current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
@@ -248,7 +249,7 @@ class HealthcareAnalyzer:
                     axis=1
                 )
-                # Zone/Region-level analysis
                 zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
                 if zone_col:
                     # Ensure we're working with string data
@@ -289,67 +290,86 @@ class HealthcareAnalyzer:
         return results
     def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
-        """Filter dataframe based on geographic scope and regions"""
         if geographic_scope == "Unknown" and not regions:
             return df.copy()
-        # Try to find a geographic column
         geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
         if geo_col is None:
             return df.copy()
         # Ensure we're working with string data
-        df[geo_col] = df[geo_col].astype(str)
         # Create filters
         filters = []
-        # Add geographic scope filter
         if geographic_scope != "Unknown":
             # Create a list of possible values for the geographic scope
             scope_values = [geographic_scope.lower()]
-            # Add common abbreviations
             abbreviations = {
-                "alberta": "ab",
-                "british columbia": "bc",
-                "ontario": "on",
-                "quebec": "qc",
-                "manitoba": "mb",
-                "saskatchewan": "sk",
-                "nova scotia": "ns",
-                "new brunswick": "nb",
-                "prince edward island": "pe",
-                "newfoundland": "nl",
-                "yukon": "yt",
-                "northwest territories": "nt",
-                "nunavut": "nu"
             }
             if geographic_scope.lower() in abbreviations:
                 scope_values.append(abbreviations[geographic_scope.lower()])
-            scope_filter = df[geo_col].str.lower().isin(scope_values)
-            filters.append(scope_filter)
-        # Add region filters
         if regions:
-            region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
-            filters.append(region_filter)
         # Apply filters
         if filters:
-            combined_filter = filters[0]
-            for f in filters[1:]:
-                combined_filter = combined_filter | f
-            return df[combined_filter].copy()
         return df.copy()
     def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze resource allocation patterns"""
         results = {}
         for data_name in relevant_data:
@@ -357,7 +377,7 @@ class HealthcareAnalyzer:
             if df is None or df.empty:
                 continue
-            # Staff analysis
             staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
             if staff_col:
                 # Ensure we're working with numeric data
@@ -374,7 +394,7 @@ class HealthcareAnalyzer:
                     avg_staff_per_bed = df['staff_per_bed'].mean()
                     results["staff_per_bed_ratio"] = avg_staff_per_bed
-            # Equipment analysis
             equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
             if equipment_cols:
                 equipment_summary = {}
@@ -387,7 +407,7 @@ class HealthcareAnalyzer:
         return results
     def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
-        """Analyze trends in healthcare data"""
         results = {}
         for data_name in relevant_data:
@@ -395,7 +415,7 @@ class HealthcareAnalyzer:
             if df is None or df.empty:
                 continue
-            # Find time-based columns
             time_cols = [col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
             if len(time_cols) >= 2:
@@ -425,7 +445,7 @@ class HealthcareAnalyzer:
         return results
     def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
-        """Generate data-driven operational recommendations"""
         recommendations = []
         geographic_scope = requirements.get("geographic_scope", "the region")
@@ -451,16 +471,21 @@ class HealthcareAnalyzer:
                     "data_source": "Capacity trend analysis"
                 })
-            # Zone-specific recommendations
             if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
-                zone_col = capacity.get("columns_used", {}).get("zone")
-                zone = capacity["max_percentage_decrease"].get(zone_col, 'a zone') if zone_col else 'a zone'
                 decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
-                if zone and decrease:
                     recommendations.append({
-                        "title": f"Address Capacity Decline in {zone}",
-                        "description": f"{zone} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
                         "priority": "High",
                         "data_source": "Zone capacity analysis"
                     })
@@ -496,7 +521,7 @@ class HealthcareAnalyzer:
         return recommendations
     def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
-        """Identify opportunities for AI integration and data enhancement"""
         opportunities = {
             "data_integration": [],
             "ai_applications": [],
@@ -546,7 +571,7 @@ class HealthcareAnalyzer:
     # Helper methods
     def _find_column(self, df, patterns):
-        """Find the first column matching any pattern"""
         if df is None or df.empty:
             return None
         for col in df.columns:
@@ -555,7 +580,7 @@ class HealthcareAnalyzer:
         return None
     def _calculate_gini(self, values):
-        """Calculate Gini coefficient for inequality measurement"""
         if not values or len(values) < 2:
             return 0
@@ -571,7 +596,7 @@ class HealthcareAnalyzer:
         return gini
     def _calculate_diversity_index(self, distribution):
-        """Calculate Shannon diversity index"""
         if not distribution:
             return 0
@@ -586,8 +611,8 @@ class HealthcareAnalyzer:
         return -sum(p * np.log(p) for p in proportions)
     def _extract_geographic_scope(self, text):
-        """Extract geographic scope from text"""
-        # Look for province/state names
         provinces = [
             "alberta", "british columbia", "ontario", "quebec", "manitoba",
             "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
@@ -628,7 +653,7 @@ class HealthcareAnalyzer:
         return "Unknown"
     def _extract_time_period(self, text):
-        """Extract time period from text"""
         # Look for year patterns
         import re
         years = re.findall(r'\b(20\d{2})\b', text)
@@ -637,7 +662,7 @@ class HealthcareAnalyzer:
         return "Unknown"
     def _extract_facility_types(self, text):
-        """Extract facility types from text"""
         types = []
         if "hospital" in text.lower():
             types.append("Hospitals")
@@ -648,7 +673,7 @@ class HealthcareAnalyzer:
         return types
     def _extract_metrics(self, text):
-        """Extract required metrics from text"""
         metrics = []
         if "bed" in text.lower():
             metrics.append("Bed capacity")
@@ -659,7 +684,7 @@ class HealthcareAnalyzer:
         return metrics
     def _identify_relevant_data(self, text):
-        """Identify relevant datasets for the scenario"""
         # Use data registry's find_related_datasets method
         keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
         return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]

         # Look for region names in the scenario
         regions = []
+        # Generic region patterns - works for any healthcare scenario
         region_patterns = [
             r'([A-Z][a-z]+ (Zone|Region|Area|District))',
+            r'(North|South|East|West|Central)',
+            r'([A-Z][a-z]+ (City|County|State|Province))',
+            r'([A-Z][a-z]+)'
         ]
         import re
             if df is None or df.empty:
                 continue
+            # Filter data based on geographic scope - generic approach
             filtered_df = self._filter_by_geography(df, geographic_scope, regions)
             if filtered_df.empty:
                 continue
+            # Facility type distribution - generic column finding
             type_col = self._find_column(filtered_df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
             if type_col:
                 # Ensure we're working with string data
                 diversity = self._calculate_diversity_index(type_dist)
                 results["facility_diversity"] = diversity
+            # Geographic distribution - generic column finding
             geo_col = self._find_column(filtered_df, ['province', 'state', 'region', 'zone', 'area'])
             if geo_col:
                 # Ensure we're working with string data
                 gini = self._calculate_gini(list(geo_dist.values()))
                 results["geographic_inequality"] = gini
+            # City distribution - generic column finding
             city_col = self._find_column(filtered_df, ['city', 'municipality', 'town'])
             if city_col:
                 # Ensure we're working with string data
             if df is None or df.empty:
                 continue
+            # Filter data based on geographic scope - generic approach
             filtered_df = self._filter_by_geography(df, geographic_scope, regions)
             if filtered_df.empty:
                 continue
+            # Current capacity - generic column finding
             capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
             if capacity_col:
                 # Ensure we're working with numeric data
                         utilization_by_type = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()
                         results["utilization_by_type"] = utilization_by_type
+                # Capacity trends - generic approach for time columns
                 time_cols = [col for col in filtered_df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
                 if len(time_cols) >= 2:
                     trend_data = {}
                             growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
                             results["capacity_growth_rate"] = growth_rate
+            # Bed change analysis - generic column finding
             prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
             current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
                     axis=1
                 )
+                # Zone/Region-level analysis - generic column finding
                 zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
                 if zone_col:
                     # Ensure we're working with string data
         return results
     def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
+        """Filter dataframe based on geographic scope and regions - generic approach"""
         if geographic_scope == "Unknown" and not regions:
             return df.copy()
+        # Try to find a geographic column - generic approach
         geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
         if geo_col is None:
             return df.copy()
         # Ensure we're working with string data
+        try:
+            df[geo_col] = df[geo_col].astype(str)
+        except Exception as e:
+            logger.warning(f"Error converting column {geo_col} to string: {str(e)}")
+            return df.copy()
         # Create filters
         filters = []
+        # Add geographic scope filter - generic approach
         if geographic_scope != "Unknown":
             # Create a list of possible values for the geographic scope
             scope_values = [geographic_scope.lower()]
+            # Add common abbreviations - generic for any region
             abbreviations = {
+                # Canadian provinces
+                "alberta": "ab", "british columbia": "bc", "ontario": "on", "quebec": "qc",
+                "manitoba": "mb", "saskatchewan": "sk", "nova scotia": "ns", "new brunswick": "nb",
+                "prince edward island": "pe", "newfoundland": "nl", "yukon": "yt",
+                "northwest territories": "nt", "nunavut": "nu",
+                # US states
+                "alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
+                "california": "ca", "colorado": "co", "connecticut": "ct", "delaware": "de",
+                "florida": "fl", "georgia": "ga", "hawaii": "hi", "idaho": "id",
+                "illinois": "il", "indiana": "in", "iowa": "ia", "kansas": "ks",
+                "kentucky": "ky", "louisiana": "la", "maine": "me", "maryland": "md",
+                "massachusetts": "ma", "michigan": "mi", "minnesota": "mn", "mississippi": "ms",
+                "missouri": "mo", "montana": "mt", "nebraska": "ne", "nevada": "nv",
+                "new hampshire": "nh", "new jersey": "nj", "new mexico": "nm", "new york": "ny",
+                "north carolina": "nc", "north dakota": "nd", "ohio": "oh", "oklahoma": "ok",
+                "oregon": "or", "pennsylvania": "pa", "rhode island": "ri", "south carolina": "sc",
+                "south dakota": "sd", "tennessee": "tn", "texas": "tx", "utah": "ut",
+                "vermont": "vt", "virginia": "va", "washington": "wa", "west virginia": "wv",
+                "wisconsin": "wi", "wyoming": "wy"
             }
             if geographic_scope.lower() in abbreviations:
                 scope_values.append(abbreviations[geographic_scope.lower()])
+            try:
+                scope_filter = df[geo_col].str.lower().isin(scope_values)
+                filters.append(scope_filter)
+            except Exception as e:
+                logger.warning(f"Error creating scope filter: {str(e)}")
+        # Add region filters - generic approach
         if regions:
+            try:
+                region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
+                filters.append(region_filter)
+            except Exception as e:
+                logger.warning(f"Error creating region filter: {str(e)}")
         # Apply filters
         if filters:
+            try:
+                combined_filter = filters[0]
+                for f in filters[1:]:
+                    combined_filter = combined_filter | f
+                return df[combined_filter].copy()
+            except Exception as e:
+                logger.warning(f"Error applying filters: {str(e)}")
         return df.copy()
     def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
+        """Analyze resource allocation patterns - generic approach"""
         results = {}
         for data_name in relevant_data:
             if df is None or df.empty:
                 continue
+            # Staff analysis - generic column finding
             staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
             if staff_col:
                 # Ensure we're working with numeric data
                     avg_staff_per_bed = df['staff_per_bed'].mean()
                     results["staff_per_bed_ratio"] = avg_staff_per_bed
+            # Equipment analysis - generic approach
             equipment_cols = [col for col in df.columns if 'equipment' in col.lower()]
             if equipment_cols:
                 equipment_summary = {}
         return results
     def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
+        """Analyze trends in healthcare data - generic approach"""
         results = {}
         for data_name in relevant_data:
             if df is None or df.empty:
                 continue
+            # Find time-based columns - generic approach
             time_cols = [col for col in df.columns if any(year in col.lower() for year in ['2020', '2021', '2022', '2023', '2024'])]
             if len(time_cols) >= 2:
         return results
     def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
+        """Generate data-driven operational recommendations - generic approach"""
         recommendations = []
         geographic_scope = requirements.get("geographic_scope", "the region")
                     "data_source": "Capacity trend analysis"
                 })
+            # Zone-specific recommendations - generic approach
             if "max_percentage_decrease" in capacity and isinstance(capacity["max_percentage_decrease"], dict):
+                # Try to find the zone name using multiple possible keys
+                zone_name = "a zone"
+                for key in ["zone", "Zone", "ZONE", "region", "Region", "REGION"]:
+                    if key in capacity["max_percentage_decrease"]:
+                        zone_name = capacity["max_percentage_decrease"][key]
+                        break
                 decrease = capacity["max_percentage_decrease"].get("percent_change", 0)
+                if zone_name and decrease:
                     recommendations.append({
+                        "title": f"Address Capacity Decline in {zone_name}",
+                        "description": f"{zone_name} shows a {decrease:.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
                         "priority": "High",
                         "data_source": "Zone capacity analysis"
                     })
         return recommendations
     def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Identify opportunities for AI integration and data enhancement - generic approach"""
         opportunities = {
             "data_integration": [],
             "ai_applications": [],
     # Helper methods
     def _find_column(self, df, patterns):
+        """Find the first column matching any pattern - generic approach"""
         if df is None or df.empty:
             return None
         for col in df.columns:
         return None
     def _calculate_gini(self, values):
+        """Calculate Gini coefficient for inequality measurement - generic approach"""
         if not values or len(values) < 2:
             return 0
         return gini
     def _calculate_diversity_index(self, distribution):
+        """Calculate Shannon diversity index - generic approach"""
         if not distribution:
             return 0
         return -sum(p * np.log(p) for p in proportions)
     def _extract_geographic_scope(self, text):
+        """Extract geographic scope from text - generic approach"""
+        # Look for province/state names - generic for any region
         provinces = [
             "alberta", "british columbia", "ontario", "quebec", "manitoba",
             "saskatchewan", "nova scotia", "new brunswick", "prince edward island",
         return "Unknown"
     def _extract_time_period(self, text):
+        """Extract time period from text - generic approach"""
         # Look for year patterns
         import re
         years = re.findall(r'\b(20\d{2})\b', text)
         return "Unknown"
     def _extract_facility_types(self, text):
+        """Extract facility types from text - generic approach"""
         types = []
         if "hospital" in text.lower():
             types.append("Hospitals")
         return types
     def _extract_metrics(self, text):
+        """Extract required metrics from text - generic approach"""
         metrics = []
         if "bed" in text.lower():
             metrics.append("Bed capacity")
         return metrics
     def _identify_relevant_data(self, text):
+        """Identify relevant datasets for the scenario - generic approach"""
         # Use data registry's find_related_datasets method
         keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
         return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]