Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 23

Commit

399738b

verified ·

1 Parent(s): 1cc2098

Update healthcare_analysis.py

Browse files

Files changed (1) hide show

healthcare_analysis.py +142 -21

healthcare_analysis.py CHANGED Viewed

@@ -78,22 +78,30 @@ class HealthcareAnalyzer:
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
-            if df is None:
                 continue
             # Geographic distribution
             geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
             if geo_col:
-                geo_dist = df[geo_col].value_counts().to_dict()
-                results["geographic_distribution"] = geo_dist
-                # Calculate Gini coefficient for inequality
-                gini = self._calculate_gini(list(geo_dist.values()))
-                results["geographic_inequality"] = gini
             # Facility type distribution
-            type_col = self._find_column(df, ['type', 'category', 'facility_type'])
             if type_col:
                 type_dist = df[type_col].value_counts().to_dict()
                 results["facility_type_distribution"] = type_dist
@@ -104,8 +112,31 @@ class HealthcareAnalyzer:
             # Urban vs rural distribution
             urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
             if urban_col:
                 urban_rural = df[urban_col].value_counts().to_dict()
                 results["urban_rural_distribution"] = urban_rural
         return results
@@ -115,29 +146,33 @@ class HealthcareAnalyzer:
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
-            if df is None:
                 continue
             # Current capacity
-            capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
             if capacity_col:
                 total_capacity = df[capacity_col].sum()
                 results["total_capacity"] = total_capacity
                 # Capacity by facility type
                 type_col = self._find_column(df, ['type', 'facility_type'])
-                if type_col:
                     capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
                     results["capacity_by_type"] = capacity_by_type
                 # Capacity utilization
                 utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
                 if utilization_col:
                     avg_utilization = df[utilization_col].mean()
                     results["average_utilization"] = avg_utilization
                     # Utilization by facility type
-                    if type_col:
                         utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
                         results["utilization_by_type"] = utilization_by_type
@@ -146,6 +181,8 @@ class HealthcareAnalyzer:
                 if len(time_cols) >= 2:
                     trend_data = {}
                     for col in time_cols:
                         trend_data[col] = df[col].sum()
                     results["capacity_trends"] = trend_data
@@ -153,8 +190,65 @@ class HealthcareAnalyzer:
                     if len(time_cols) >= 2:
                         latest = time_cols[-1]
                         earliest = time_cols[0]
-                        growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
-                        results["capacity_growth_rate"] = growth_rate
         return results
@@ -164,19 +258,23 @@ class HealthcareAnalyzer:
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
-            if df is None:
                 continue
             # Staff analysis
             staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
             if staff_col:
                 total_staff = df[staff_col].sum()
                 results["total_staff"] = total_staff
                 # Staff per bed ratio
                 capacity_col = self._find_column(df, ['capacity', 'beds'])
-                if capacity_col:
-                    df['staff_per_bed'] = df[staff_col] / df[capacity_col]
                     avg_staff_per_bed = df['staff_per_bed'].mean()
                     results["staff_per_bed_ratio"] = avg_staff_per_bed
@@ -185,6 +283,8 @@ class HealthcareAnalyzer:
             if equipment_cols:
                 equipment_summary = {}
                 for col in equipment_cols:
                     equipment_summary[col] = df[col].sum()
                 results["equipment_summary"] = equipment_summary
@@ -196,7 +296,7 @@ class HealthcareAnalyzer:
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
-            if df is None:
                 continue
             # Find time-based columns
@@ -210,10 +310,14 @@ class HealthcareAnalyzer:
                     prev_year = time_cols[i-1]
                     curr_year = time_cols[i]
                     prev_total = df[prev_year].sum()
                     curr_total = df[curr_year].sum()
-                    if prev_total > 0:
                         change_pct = (curr_total - prev_total) / prev_total * 100
                         trends[f"{prev_year}_to_{curr_year}"] = {
                             "absolute_change": curr_total - prev_total,
@@ -332,6 +436,8 @@ class HealthcareAnalyzer:
     # Helper methods
     def _find_column(self, df, patterns):
         """Find the first column matching any pattern"""
         for col in df.columns:
             if any(pattern.lower() in col.lower() for pattern in patterns):
                 return col
@@ -339,19 +445,34 @@ class HealthcareAnalyzer:
     def _calculate_gini(self, values):
         """Calculate Gini coefficient for inequality measurement"""
         values = sorted(values)
         n = len(values)
         index = np.arange(1, n + 1)
-        gini = (np.sum((2 * index - n - 1) * values)) / (n * np.sum(values))
         return gini
     def _calculate_diversity_index(self, distribution):
         """Calculate Shannon diversity index"""
         total = sum(distribution.values())
         if total == 0:
             return 0
-        proportions = [count/total for count in distribution.values()]
-        return -sum(p * np.log(p) for p in proportions if p > 0)
     def _extract_geographic_scope(self, text):
         """Extract geographic scope from text"""

         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
+            if df is None or df.empty:
                 continue
             # Geographic distribution
             geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
             if geo_col:
+                # Ensure we're working with string data
+                df[geo_col] = df[geo_col].astype(str)
+                alberta_mask = df[geo_col].str.lower().isin(['alberta', 'ab'])
+                ab_facilities = df[alberta_mask].copy()
+                if not ab_facilities.empty:
+                    geo_dist = ab_facilities[geo_col].value_counts().to_dict()
+                    results["geographic_distribution"] = geo_dist
+                    # Calculate Gini coefficient for inequality
+                    gini = self._calculate_gini(list(geo_dist.values()))
+                    results["geographic_inequality"] = gini
             # Facility type distribution
+            type_col = self._find_column(df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
             if type_col:
+                # Ensure we're working with string data
+                df[type_col] = df[type_col].astype(str)
                 type_dist = df[type_col].value_counts().to_dict()
                 results["facility_type_distribution"] = type_dist
             # Urban vs rural distribution
             urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
             if urban_col:
+                # Ensure we're working with string data
+                df[urban_col] = df[urban_col].astype(str)
                 urban_rural = df[urban_col].value_counts().to_dict()
                 results["urban_rural_distribution"] = urban_rural
+            # City distribution
+            city_col = self._find_column(df, ['city', 'municipality', 'town'])
+            if city_col:
+                # Ensure we're working with string data
+                df[city_col] = df[city_col].astype(str)
+                city_counts = df[city_col].value_counts().head(5)
+                top_cities = city_counts.index.tolist()
+                # Breakdown by facility type for top cities
+                city_breakdown = {}
+                for city in top_cities:
+                    city_data = df[df[city_col] == city]
+                    if not city_data.empty and type_col in city_data.columns:
+                        city_breakdown[city] = city_data[type_col].value_counts().to_dict()
+                results["top_cities"] = top_cities
+                results["city_breakdown"] = city_breakdown
+                # Total facilities count
+                results["total_facilities"] = len(df)
         return results
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
+            if df is None or df.empty:
                 continue
             # Current capacity
+            capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
             if capacity_col:
+                # Ensure we're working with numeric data
+                df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
                 total_capacity = df[capacity_col].sum()
                 results["total_capacity"] = total_capacity
                 # Capacity by facility type
                 type_col = self._find_column(df, ['type', 'facility_type'])
+                if type_col and type_col in df.columns:
                     capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
                     results["capacity_by_type"] = capacity_by_type
                 # Capacity utilization
                 utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
                 if utilization_col:
+                    # Ensure we're working with numeric data
+                    df[utilization_col] = pd.to_numeric(df[utilization_col], errors='coerce')
                     avg_utilization = df[utilization_col].mean()
                     results["average_utilization"] = avg_utilization
                     # Utilization by facility type
+                    if type_col and type_col in df.columns:
                         utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
                         results["utilization_by_type"] = utilization_by_type
                 if len(time_cols) >= 2:
                     trend_data = {}
                     for col in time_cols:
+                        # Ensure we're working with numeric data
+                        df[col] = pd.to_numeric(df[col], errors='coerce')
                         trend_data[col] = df[col].sum()
                     results["capacity_trends"] = trend_data
                     if len(time_cols) >= 2:
                         latest = time_cols[-1]
                         earliest = time_cols[0]
+                        if trend_data[earliest] > 0:  # Avoid division by zero
+                            growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
+                            results["capacity_growth_rate"] = growth_rate
+            # Bed change analysis
+            prev_col = self._find_column(df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
+            current_col = self._find_column(df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
+            if prev_col and current_col:
+                # Ensure we're working with numeric data
+                df[prev_col] = pd.to_numeric(df[prev_col], errors='coerce')
+                df[current_col] = pd.to_numeric(df[current_col], errors='coerce')
+                # Calculate bed change
+                df['bed_change'] = df[current_col] - df[prev_col]
+                # Calculate percentage change
+                df['percent_change'] = df.apply(
+                    lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
+                    axis=1
+                )
+                # Zone-level analysis
+                zone_col = self._find_column(df, ['zone', 'region', 'area', 'district'])
+                if zone_col:
+                    # Ensure we're working with string data
+                    df[zone_col] = df[zone_col].astype(str)
+                    zone_summary = df.groupby(zone_col).agg({
+                        current_col: 'sum',
+                        prev_col: 'sum',
+                        'bed_change': 'sum'
+                    }).reset_index()
+                    zone_summary['percent_change'] = zone_summary.apply(
+                        lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
+                        axis=1
+                    )
+                    results["zone_summary"] = zone_summary.to_dict('records')
+                    # Find zones with largest changes
+                    if not zone_summary.empty:
+                        # Get zone with largest absolute decrease
+                        if zone_summary['bed_change'].notna().any():
+                            max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
+                            max_abs_decrease = zone_summary.loc[max_abs_decrease_idx]
+                            results["max_absolute_decrease"] = max_abs_decrease.to_dict()
+                        # Get zone with largest percentage decrease
+                        if zone_summary['percent_change'].notna().any():
+                            max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
+                            max_pct_decrease = zone_summary.loc[max_pct_decrease_idx]
+                            results["max_percentage_decrease"] = max_pct_decrease.to_dict()
+                    # Identify facilities with largest declines
+                    facilities_decline = df.sort_values('bed_change').head(5)
+                    if not facilities_decline.empty:
+                        results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')
         return results
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
+            if df is None or df.empty:
                 continue
             # Staff analysis
             staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
             if staff_col:
+                # Ensure we're working with numeric data
+                df[staff_col] = pd.to_numeric(df[staff_col], errors='coerce')
                 total_staff = df[staff_col].sum()
                 results["total_staff"] = total_staff
                 # Staff per bed ratio
                 capacity_col = self._find_column(df, ['capacity', 'beds'])
+                if capacity_col and capacity_col in df.columns:
+                    # Ensure we're working with numeric data
+                    df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
+                    df['staff_per_bed'] = df[staff_col] / df[capacity_col].replace(0, np.nan)  # Avoid division by zero
                     avg_staff_per_bed = df['staff_per_bed'].mean()
                     results["staff_per_bed_ratio"] = avg_staff_per_bed
             if equipment_cols:
                 equipment_summary = {}
                 for col in equipment_cols:
+                    # Ensure we're working with numeric data
+                    df[col] = pd.to_numeric(df[col], errors='coerce')
                     equipment_summary[col] = df[col].sum()
                 results["equipment_summary"] = equipment_summary
         for data_name in relevant_data:
             df = self.data_registry.get(data_name)
+            if df is None or df.empty:
                 continue
             # Find time-based columns
                     prev_year = time_cols[i-1]
                     curr_year = time_cols[i]
+                    # Ensure we're working with numeric data
+                    df[prev_year] = pd.to_numeric(df[prev_year], errors='coerce')
+                    df[curr_year] = pd.to_numeric(df[curr_year], errors='coerce')
                     prev_total = df[prev_year].sum()
                     curr_total = df[curr_year].sum()
+                    if prev_total > 0:  # Avoid division by zero
                         change_pct = (curr_total - prev_total) / prev_total * 100
                         trends[f"{prev_year}_to_{curr_year}"] = {
                             "absolute_change": curr_total - prev_total,
     # Helper methods
     def _find_column(self, df, patterns):
         """Find the first column matching any pattern"""
+        if df is None or df.empty:
+            return None
         for col in df.columns:
             if any(pattern.lower() in col.lower() for pattern in patterns):
                 return col
     def _calculate_gini(self, values):
         """Calculate Gini coefficient for inequality measurement"""
+        if not values or len(values) < 2:
+            return 0
         values = sorted(values)
         n = len(values)
         index = np.arange(1, n + 1)
+        total = np.sum(values)
+        if total == 0:
+            return 0
+        gini = (np.sum((2 * index - n - 1) * values)) / (n * total)
         return gini
     def _calculate_diversity_index(self, distribution):
         """Calculate Shannon diversity index"""
+        if not distribution:
+            return 0
         total = sum(distribution.values())
         if total == 0:
             return 0
+        proportions = [count/total for count in distribution.values() if count > 0]
+        if not proportions:
+            return 0
+        return -sum(p * np.log(p) for p in proportions)
     def _extract_geographic_scope(self, text):
         """Extract geographic scope from text"""