Spaces:
Sleeping
Sleeping
Rajan Sharma
committed on
Update healthcare_analysis.py
Browse files- healthcare_analysis.py +142 -21
healthcare_analysis.py
CHANGED
|
@@ -78,22 +78,30 @@ class HealthcareAnalyzer:
|
|
| 78 |
|
| 79 |
for data_name in relevant_data:
|
| 80 |
df = self.data_registry.get(data_name)
|
| 81 |
-
if df is None:
|
| 82 |
continue
|
| 83 |
|
| 84 |
# Geographic distribution
|
| 85 |
geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
|
| 86 |
if geo_col:
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# Facility type distribution
|
| 95 |
-
type_col = self._find_column(df, ['type', 'category', 'facility_type'])
|
| 96 |
if type_col:
|
|
|
|
|
|
|
| 97 |
type_dist = df[type_col].value_counts().to_dict()
|
| 98 |
results["facility_type_distribution"] = type_dist
|
| 99 |
|
|
@@ -104,8 +112,31 @@ class HealthcareAnalyzer:
|
|
| 104 |
# Urban vs rural distribution
|
| 105 |
urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
|
| 106 |
if urban_col:
|
|
|
|
|
|
|
| 107 |
urban_rural = df[urban_col].value_counts().to_dict()
|
| 108 |
results["urban_rural_distribution"] = urban_rural
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
return results
|
| 111 |
|
|
@@ -115,29 +146,33 @@ class HealthcareAnalyzer:
|
|
| 115 |
|
| 116 |
for data_name in relevant_data:
|
| 117 |
df = self.data_registry.get(data_name)
|
| 118 |
-
if df is None:
|
| 119 |
continue
|
| 120 |
|
| 121 |
# Current capacity
|
| 122 |
-
capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
|
| 123 |
if capacity_col:
|
|
|
|
|
|
|
| 124 |
total_capacity = df[capacity_col].sum()
|
| 125 |
results["total_capacity"] = total_capacity
|
| 126 |
|
| 127 |
# Capacity by facility type
|
| 128 |
type_col = self._find_column(df, ['type', 'facility_type'])
|
| 129 |
-
if type_col:
|
| 130 |
capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
|
| 131 |
results["capacity_by_type"] = capacity_by_type
|
| 132 |
|
| 133 |
# Capacity utilization
|
| 134 |
utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
|
| 135 |
if utilization_col:
|
|
|
|
|
|
|
| 136 |
avg_utilization = df[utilization_col].mean()
|
| 137 |
results["average_utilization"] = avg_utilization
|
| 138 |
|
| 139 |
# Utilization by facility type
|
| 140 |
-
if type_col:
|
| 141 |
utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
|
| 142 |
results["utilization_by_type"] = utilization_by_type
|
| 143 |
|
|
@@ -146,6 +181,8 @@ class HealthcareAnalyzer:
|
|
| 146 |
if len(time_cols) >= 2:
|
| 147 |
trend_data = {}
|
| 148 |
for col in time_cols:
|
|
|
|
|
|
|
| 149 |
trend_data[col] = df[col].sum()
|
| 150 |
results["capacity_trends"] = trend_data
|
| 151 |
|
|
@@ -153,8 +190,65 @@ class HealthcareAnalyzer:
|
|
| 153 |
if len(time_cols) >= 2:
|
| 154 |
latest = time_cols[-1]
|
| 155 |
earliest = time_cols[0]
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
return results
|
| 160 |
|
|
@@ -164,19 +258,23 @@ class HealthcareAnalyzer:
|
|
| 164 |
|
| 165 |
for data_name in relevant_data:
|
| 166 |
df = self.data_registry.get(data_name)
|
| 167 |
-
if df is None:
|
| 168 |
continue
|
| 169 |
|
| 170 |
# Staff analysis
|
| 171 |
staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
|
| 172 |
if staff_col:
|
|
|
|
|
|
|
| 173 |
total_staff = df[staff_col].sum()
|
| 174 |
results["total_staff"] = total_staff
|
| 175 |
|
| 176 |
# Staff per bed ratio
|
| 177 |
capacity_col = self._find_column(df, ['capacity', 'beds'])
|
| 178 |
-
if capacity_col:
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
avg_staff_per_bed = df['staff_per_bed'].mean()
|
| 181 |
results["staff_per_bed_ratio"] = avg_staff_per_bed
|
| 182 |
|
|
@@ -185,6 +283,8 @@ class HealthcareAnalyzer:
|
|
| 185 |
if equipment_cols:
|
| 186 |
equipment_summary = {}
|
| 187 |
for col in equipment_cols:
|
|
|
|
|
|
|
| 188 |
equipment_summary[col] = df[col].sum()
|
| 189 |
results["equipment_summary"] = equipment_summary
|
| 190 |
|
|
@@ -196,7 +296,7 @@ class HealthcareAnalyzer:
|
|
| 196 |
|
| 197 |
for data_name in relevant_data:
|
| 198 |
df = self.data_registry.get(data_name)
|
| 199 |
-
if df is None:
|
| 200 |
continue
|
| 201 |
|
| 202 |
# Find time-based columns
|
|
@@ -210,10 +310,14 @@ class HealthcareAnalyzer:
|
|
| 210 |
prev_year = time_cols[i-1]
|
| 211 |
curr_year = time_cols[i]
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
prev_total = df[prev_year].sum()
|
| 214 |
curr_total = df[curr_year].sum()
|
| 215 |
|
| 216 |
-
if prev_total > 0:
|
| 217 |
change_pct = (curr_total - prev_total) / prev_total * 100
|
| 218 |
trends[f"{prev_year}_to_{curr_year}"] = {
|
| 219 |
"absolute_change": curr_total - prev_total,
|
|
@@ -332,6 +436,8 @@ class HealthcareAnalyzer:
|
|
| 332 |
# Helper methods
|
| 333 |
def _find_column(self, df, patterns):
|
| 334 |
"""Find the first column matching any pattern"""
|
|
|
|
|
|
|
| 335 |
for col in df.columns:
|
| 336 |
if any(pattern.lower() in col.lower() for pattern in patterns):
|
| 337 |
return col
|
|
@@ -339,19 +445,34 @@ class HealthcareAnalyzer:
|
|
| 339 |
|
| 340 |
def _calculate_gini(self, values):
|
| 341 |
"""Calculate Gini coefficient for inequality measurement"""
|
|
|
|
|
|
|
|
|
|
| 342 |
values = sorted(values)
|
| 343 |
n = len(values)
|
| 344 |
index = np.arange(1, n + 1)
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
return gini
|
| 347 |
|
| 348 |
def _calculate_diversity_index(self, distribution):
|
| 349 |
"""Calculate Shannon diversity index"""
|
|
|
|
|
|
|
|
|
|
| 350 |
total = sum(distribution.values())
|
| 351 |
if total == 0:
|
| 352 |
return 0
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
def _extract_geographic_scope(self, text):
|
| 357 |
"""Extract geographic scope from text"""
|
|
|
|
| 78 |
|
| 79 |
for data_name in relevant_data:
|
| 80 |
df = self.data_registry.get(data_name)
|
| 81 |
+
if df is None or df.empty:
|
| 82 |
continue
|
| 83 |
|
| 84 |
# Geographic distribution
|
| 85 |
geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
|
| 86 |
if geo_col:
|
| 87 |
+
# Ensure we're working with string data
|
| 88 |
+
df[geo_col] = df[geo_col].astype(str)
|
| 89 |
+
alberta_mask = df[geo_col].str.lower().isin(['alberta', 'ab'])
|
| 90 |
+
ab_facilities = df[alberta_mask].copy()
|
| 91 |
|
| 92 |
+
if not ab_facilities.empty:
|
| 93 |
+
geo_dist = ab_facilities[geo_col].value_counts().to_dict()
|
| 94 |
+
results["geographic_distribution"] = geo_dist
|
| 95 |
+
|
| 96 |
+
# Calculate Gini coefficient for inequality
|
| 97 |
+
gini = self._calculate_gini(list(geo_dist.values()))
|
| 98 |
+
results["geographic_inequality"] = gini
|
| 99 |
|
| 100 |
# Facility type distribution
|
| 101 |
+
type_col = self._find_column(df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
|
| 102 |
if type_col:
|
| 103 |
+
# Ensure we're working with string data
|
| 104 |
+
df[type_col] = df[type_col].astype(str)
|
| 105 |
type_dist = df[type_col].value_counts().to_dict()
|
| 106 |
results["facility_type_distribution"] = type_dist
|
| 107 |
|
|
|
|
| 112 |
# Urban vs rural distribution
|
| 113 |
urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
|
| 114 |
if urban_col:
|
| 115 |
+
# Ensure we're working with string data
|
| 116 |
+
df[urban_col] = df[urban_col].astype(str)
|
| 117 |
urban_rural = df[urban_col].value_counts().to_dict()
|
| 118 |
results["urban_rural_distribution"] = urban_rural
|
| 119 |
+
|
| 120 |
+
# City distribution
|
| 121 |
+
city_col = self._find_column(df, ['city', 'municipality', 'town'])
|
| 122 |
+
if city_col:
|
| 123 |
+
# Ensure we're working with string data
|
| 124 |
+
df[city_col] = df[city_col].astype(str)
|
| 125 |
+
city_counts = df[city_col].value_counts().head(5)
|
| 126 |
+
top_cities = city_counts.index.tolist()
|
| 127 |
+
|
| 128 |
+
# Breakdown by facility type for top cities
|
| 129 |
+
city_breakdown = {}
|
| 130 |
+
for city in top_cities:
|
| 131 |
+
city_data = df[df[city_col] == city]
|
| 132 |
+
if not city_data.empty and type_col in city_data.columns:
|
| 133 |
+
city_breakdown[city] = city_data[type_col].value_counts().to_dict()
|
| 134 |
+
|
| 135 |
+
results["top_cities"] = top_cities
|
| 136 |
+
results["city_breakdown"] = city_breakdown
|
| 137 |
+
|
| 138 |
+
# Total facilities count
|
| 139 |
+
results["total_facilities"] = len(df)
|
| 140 |
|
| 141 |
return results
|
| 142 |
|
|
|
|
| 146 |
|
| 147 |
for data_name in relevant_data:
|
| 148 |
df = self.data_registry.get(data_name)
|
| 149 |
+
if df is None or df.empty:
|
| 150 |
continue
|
| 151 |
|
| 152 |
# Current capacity
|
| 153 |
+
capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
|
| 154 |
if capacity_col:
|
| 155 |
+
# Ensure we're working with numeric data
|
| 156 |
+
df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
|
| 157 |
total_capacity = df[capacity_col].sum()
|
| 158 |
results["total_capacity"] = total_capacity
|
| 159 |
|
| 160 |
# Capacity by facility type
|
| 161 |
type_col = self._find_column(df, ['type', 'facility_type'])
|
| 162 |
+
if type_col and type_col in df.columns:
|
| 163 |
capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
|
| 164 |
results["capacity_by_type"] = capacity_by_type
|
| 165 |
|
| 166 |
# Capacity utilization
|
| 167 |
utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
|
| 168 |
if utilization_col:
|
| 169 |
+
# Ensure we're working with numeric data
|
| 170 |
+
df[utilization_col] = pd.to_numeric(df[utilization_col], errors='coerce')
|
| 171 |
avg_utilization = df[utilization_col].mean()
|
| 172 |
results["average_utilization"] = avg_utilization
|
| 173 |
|
| 174 |
# Utilization by facility type
|
| 175 |
+
if type_col and type_col in df.columns:
|
| 176 |
utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
|
| 177 |
results["utilization_by_type"] = utilization_by_type
|
| 178 |
|
|
|
|
| 181 |
if len(time_cols) >= 2:
|
| 182 |
trend_data = {}
|
| 183 |
for col in time_cols:
|
| 184 |
+
# Ensure we're working with numeric data
|
| 185 |
+
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 186 |
trend_data[col] = df[col].sum()
|
| 187 |
results["capacity_trends"] = trend_data
|
| 188 |
|
|
|
|
| 190 |
if len(time_cols) >= 2:
|
| 191 |
latest = time_cols[-1]
|
| 192 |
earliest = time_cols[0]
|
| 193 |
+
if trend_data[earliest] > 0: # Avoid division by zero
|
| 194 |
+
growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
|
| 195 |
+
results["capacity_growth_rate"] = growth_rate
|
| 196 |
+
|
| 197 |
+
# Bed change analysis
|
| 198 |
+
prev_col = self._find_column(df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
|
| 199 |
+
current_col = self._find_column(df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
|
| 200 |
+
|
| 201 |
+
if prev_col and current_col:
|
| 202 |
+
# Ensure we're working with numeric data
|
| 203 |
+
df[prev_col] = pd.to_numeric(df[prev_col], errors='coerce')
|
| 204 |
+
df[current_col] = pd.to_numeric(df[current_col], errors='coerce')
|
| 205 |
+
|
| 206 |
+
# Calculate bed change
|
| 207 |
+
df['bed_change'] = df[current_col] - df[prev_col]
|
| 208 |
+
|
| 209 |
+
# Calculate percentage change
|
| 210 |
+
df['percent_change'] = df.apply(
|
| 211 |
+
lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
|
| 212 |
+
axis=1
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
# Zone-level analysis
|
| 216 |
+
zone_col = self._find_column(df, ['zone', 'region', 'area', 'district'])
|
| 217 |
+
if zone_col:
|
| 218 |
+
# Ensure we're working with string data
|
| 219 |
+
df[zone_col] = df[zone_col].astype(str)
|
| 220 |
+
|
| 221 |
+
zone_summary = df.groupby(zone_col).agg({
|
| 222 |
+
current_col: 'sum',
|
| 223 |
+
prev_col: 'sum',
|
| 224 |
+
'bed_change': 'sum'
|
| 225 |
+
}).reset_index()
|
| 226 |
+
|
| 227 |
+
zone_summary['percent_change'] = zone_summary.apply(
|
| 228 |
+
lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
|
| 229 |
+
axis=1
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
results["zone_summary"] = zone_summary.to_dict('records')
|
| 233 |
+
|
| 234 |
+
# Find zones with largest changes
|
| 235 |
+
if not zone_summary.empty:
|
| 236 |
+
# Get zone with largest absolute decrease
|
| 237 |
+
if zone_summary['bed_change'].notna().any():
|
| 238 |
+
max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
|
| 239 |
+
max_abs_decrease = zone_summary.loc[max_abs_decrease_idx]
|
| 240 |
+
results["max_absolute_decrease"] = max_abs_decrease.to_dict()
|
| 241 |
+
|
| 242 |
+
# Get zone with largest percentage decrease
|
| 243 |
+
if zone_summary['percent_change'].notna().any():
|
| 244 |
+
max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
|
| 245 |
+
max_pct_decrease = zone_summary.loc[max_pct_decrease_idx]
|
| 246 |
+
results["max_percentage_decrease"] = max_pct_decrease.to_dict()
|
| 247 |
+
|
| 248 |
+
# Identify facilities with largest declines
|
| 249 |
+
facilities_decline = df.sort_values('bed_change').head(5)
|
| 250 |
+
if not facilities_decline.empty:
|
| 251 |
+
results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')
|
| 252 |
|
| 253 |
return results
|
| 254 |
|
|
|
|
| 258 |
|
| 259 |
for data_name in relevant_data:
|
| 260 |
df = self.data_registry.get(data_name)
|
| 261 |
+
if df is None or df.empty:
|
| 262 |
continue
|
| 263 |
|
| 264 |
# Staff analysis
|
| 265 |
staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
|
| 266 |
if staff_col:
|
| 267 |
+
# Ensure we're working with numeric data
|
| 268 |
+
df[staff_col] = pd.to_numeric(df[staff_col], errors='coerce')
|
| 269 |
total_staff = df[staff_col].sum()
|
| 270 |
results["total_staff"] = total_staff
|
| 271 |
|
| 272 |
# Staff per bed ratio
|
| 273 |
capacity_col = self._find_column(df, ['capacity', 'beds'])
|
| 274 |
+
if capacity_col and capacity_col in df.columns:
|
| 275 |
+
# Ensure we're working with numeric data
|
| 276 |
+
df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
|
| 277 |
+
df['staff_per_bed'] = df[staff_col] / df[capacity_col].replace(0, np.nan) # Avoid division by zero
|
| 278 |
avg_staff_per_bed = df['staff_per_bed'].mean()
|
| 279 |
results["staff_per_bed_ratio"] = avg_staff_per_bed
|
| 280 |
|
|
|
|
| 283 |
if equipment_cols:
|
| 284 |
equipment_summary = {}
|
| 285 |
for col in equipment_cols:
|
| 286 |
+
# Ensure we're working with numeric data
|
| 287 |
+
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 288 |
equipment_summary[col] = df[col].sum()
|
| 289 |
results["equipment_summary"] = equipment_summary
|
| 290 |
|
|
|
|
| 296 |
|
| 297 |
for data_name in relevant_data:
|
| 298 |
df = self.data_registry.get(data_name)
|
| 299 |
+
if df is None or df.empty:
|
| 300 |
continue
|
| 301 |
|
| 302 |
# Find time-based columns
|
|
|
|
| 310 |
prev_year = time_cols[i-1]
|
| 311 |
curr_year = time_cols[i]
|
| 312 |
|
| 313 |
+
# Ensure we're working with numeric data
|
| 314 |
+
df[prev_year] = pd.to_numeric(df[prev_year], errors='coerce')
|
| 315 |
+
df[curr_year] = pd.to_numeric(df[curr_year], errors='coerce')
|
| 316 |
+
|
| 317 |
prev_total = df[prev_year].sum()
|
| 318 |
curr_total = df[curr_year].sum()
|
| 319 |
|
| 320 |
+
if prev_total > 0: # Avoid division by zero
|
| 321 |
change_pct = (curr_total - prev_total) / prev_total * 100
|
| 322 |
trends[f"{prev_year}_to_{curr_year}"] = {
|
| 323 |
"absolute_change": curr_total - prev_total,
|
|
|
|
| 436 |
# Helper methods
|
| 437 |
def _find_column(self, df, patterns):
|
| 438 |
"""Find the first column matching any pattern"""
|
| 439 |
+
if df is None or df.empty:
|
| 440 |
+
return None
|
| 441 |
for col in df.columns:
|
| 442 |
if any(pattern.lower() in col.lower() for pattern in patterns):
|
| 443 |
return col
|
|
|
|
| 445 |
|
| 446 |
def _calculate_gini(self, values):
|
| 447 |
"""Calculate Gini coefficient for inequality measurement"""
|
| 448 |
+
if not values or len(values) < 2:
|
| 449 |
+
return 0
|
| 450 |
+
|
| 451 |
values = sorted(values)
|
| 452 |
n = len(values)
|
| 453 |
index = np.arange(1, n + 1)
|
| 454 |
+
total = np.sum(values)
|
| 455 |
+
|
| 456 |
+
if total == 0:
|
| 457 |
+
return 0
|
| 458 |
+
|
| 459 |
+
gini = (np.sum((2 * index - n - 1) * values)) / (n * total)
|
| 460 |
return gini
|
| 461 |
|
| 462 |
def _calculate_diversity_index(self, distribution):
|
| 463 |
"""Calculate Shannon diversity index"""
|
| 464 |
+
if not distribution:
|
| 465 |
+
return 0
|
| 466 |
+
|
| 467 |
total = sum(distribution.values())
|
| 468 |
if total == 0:
|
| 469 |
return 0
|
| 470 |
+
|
| 471 |
+
proportions = [count/total for count in distribution.values() if count > 0]
|
| 472 |
+
if not proportions:
|
| 473 |
+
return 0
|
| 474 |
+
|
| 475 |
+
return -sum(p * np.log(p) for p in proportions)
|
| 476 |
|
| 477 |
def _extract_geographic_scope(self, text):
|
| 478 |
"""Extract geographic scope from text"""
|