Rajan Sharma committed on
Commit
399738b
·
verified ·
1 Parent(s): 1cc2098

Update healthcare_analysis.py

Browse files
Files changed (1) hide show
  1. healthcare_analysis.py +142 -21
healthcare_analysis.py CHANGED
@@ -78,22 +78,30 @@ class HealthcareAnalyzer:
78
 
79
  for data_name in relevant_data:
80
  df = self.data_registry.get(data_name)
81
- if df is None:
82
  continue
83
 
84
  # Geographic distribution
85
  geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
86
  if geo_col:
87
- geo_dist = df[geo_col].value_counts().to_dict()
88
- results["geographic_distribution"] = geo_dist
 
 
89
 
90
- # Calculate Gini coefficient for inequality
91
- gini = self._calculate_gini(list(geo_dist.values()))
92
- results["geographic_inequality"] = gini
 
 
 
 
93
 
94
  # Facility type distribution
95
- type_col = self._find_column(df, ['type', 'category', 'facility_type'])
96
  if type_col:
 
 
97
  type_dist = df[type_col].value_counts().to_dict()
98
  results["facility_type_distribution"] = type_dist
99
 
@@ -104,8 +112,31 @@ class HealthcareAnalyzer:
104
  # Urban vs rural distribution
105
  urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
106
  if urban_col:
 
 
107
  urban_rural = df[urban_col].value_counts().to_dict()
108
  results["urban_rural_distribution"] = urban_rural
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  return results
111
 
@@ -115,29 +146,33 @@ class HealthcareAnalyzer:
115
 
116
  for data_name in relevant_data:
117
  df = self.data_registry.get(data_name)
118
- if df is None:
119
  continue
120
 
121
  # Current capacity
122
- capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
123
  if capacity_col:
 
 
124
  total_capacity = df[capacity_col].sum()
125
  results["total_capacity"] = total_capacity
126
 
127
  # Capacity by facility type
128
  type_col = self._find_column(df, ['type', 'facility_type'])
129
- if type_col:
130
  capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
131
  results["capacity_by_type"] = capacity_by_type
132
 
133
  # Capacity utilization
134
  utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
135
  if utilization_col:
 
 
136
  avg_utilization = df[utilization_col].mean()
137
  results["average_utilization"] = avg_utilization
138
 
139
  # Utilization by facility type
140
- if type_col:
141
  utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
142
  results["utilization_by_type"] = utilization_by_type
143
 
@@ -146,6 +181,8 @@ class HealthcareAnalyzer:
146
  if len(time_cols) >= 2:
147
  trend_data = {}
148
  for col in time_cols:
 
 
149
  trend_data[col] = df[col].sum()
150
  results["capacity_trends"] = trend_data
151
 
@@ -153,8 +190,65 @@ class HealthcareAnalyzer:
153
  if len(time_cols) >= 2:
154
  latest = time_cols[-1]
155
  earliest = time_cols[0]
156
- growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
157
- results["capacity_growth_rate"] = growth_rate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  return results
160
 
@@ -164,19 +258,23 @@ class HealthcareAnalyzer:
164
 
165
  for data_name in relevant_data:
166
  df = self.data_registry.get(data_name)
167
- if df is None:
168
  continue
169
 
170
  # Staff analysis
171
  staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
172
  if staff_col:
 
 
173
  total_staff = df[staff_col].sum()
174
  results["total_staff"] = total_staff
175
 
176
  # Staff per bed ratio
177
  capacity_col = self._find_column(df, ['capacity', 'beds'])
178
- if capacity_col:
179
- df['staff_per_bed'] = df[staff_col] / df[capacity_col]
 
 
180
  avg_staff_per_bed = df['staff_per_bed'].mean()
181
  results["staff_per_bed_ratio"] = avg_staff_per_bed
182
 
@@ -185,6 +283,8 @@ class HealthcareAnalyzer:
185
  if equipment_cols:
186
  equipment_summary = {}
187
  for col in equipment_cols:
 
 
188
  equipment_summary[col] = df[col].sum()
189
  results["equipment_summary"] = equipment_summary
190
 
@@ -196,7 +296,7 @@ class HealthcareAnalyzer:
196
 
197
  for data_name in relevant_data:
198
  df = self.data_registry.get(data_name)
199
- if df is None:
200
  continue
201
 
202
  # Find time-based columns
@@ -210,10 +310,14 @@ class HealthcareAnalyzer:
210
  prev_year = time_cols[i-1]
211
  curr_year = time_cols[i]
212
 
 
 
 
 
213
  prev_total = df[prev_year].sum()
214
  curr_total = df[curr_year].sum()
215
 
216
- if prev_total > 0:
217
  change_pct = (curr_total - prev_total) / prev_total * 100
218
  trends[f"{prev_year}_to_{curr_year}"] = {
219
  "absolute_change": curr_total - prev_total,
@@ -332,6 +436,8 @@ class HealthcareAnalyzer:
332
  # Helper methods
333
  def _find_column(self, df, patterns):
334
  """Find the first column matching any pattern"""
 
 
335
  for col in df.columns:
336
  if any(pattern.lower() in col.lower() for pattern in patterns):
337
  return col
@@ -339,19 +445,34 @@ class HealthcareAnalyzer:
339
 
340
  def _calculate_gini(self, values):
341
  """Calculate Gini coefficient for inequality measurement"""
 
 
 
342
  values = sorted(values)
343
  n = len(values)
344
  index = np.arange(1, n + 1)
345
- gini = (np.sum((2 * index - n - 1) * values)) / (n * np.sum(values))
 
 
 
 
 
346
  return gini
347
 
348
  def _calculate_diversity_index(self, distribution):
349
  """Calculate Shannon diversity index"""
 
 
 
350
  total = sum(distribution.values())
351
  if total == 0:
352
  return 0
353
- proportions = [count/total for count in distribution.values()]
354
- return -sum(p * np.log(p) for p in proportions if p > 0)
 
 
 
 
355
 
356
  def _extract_geographic_scope(self, text):
357
  """Extract geographic scope from text"""
 
78
 
79
  for data_name in relevant_data:
80
  df = self.data_registry.get(data_name)
81
+ if df is None or df.empty:
82
  continue
83
 
84
  # Geographic distribution
85
  geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
86
  if geo_col:
87
+ # Ensure we're working with string data
88
+ df[geo_col] = df[geo_col].astype(str)
89
+ alberta_mask = df[geo_col].str.lower().isin(['alberta', 'ab'])
90
+ ab_facilities = df[alberta_mask].copy()
91
 
92
+ if not ab_facilities.empty:
93
+ geo_dist = ab_facilities[geo_col].value_counts().to_dict()
94
+ results["geographic_distribution"] = geo_dist
95
+
96
+ # Calculate Gini coefficient for inequality
97
+ gini = self._calculate_gini(list(geo_dist.values()))
98
+ results["geographic_inequality"] = gini
99
 
100
  # Facility type distribution
101
+ type_col = self._find_column(df, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
102
  if type_col:
103
+ # Ensure we're working with string data
104
+ df[type_col] = df[type_col].astype(str)
105
  type_dist = df[type_col].value_counts().to_dict()
106
  results["facility_type_distribution"] = type_dist
107
 
 
112
  # Urban vs rural distribution
113
  urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
114
  if urban_col:
115
+ # Ensure we're working with string data
116
+ df[urban_col] = df[urban_col].astype(str)
117
  urban_rural = df[urban_col].value_counts().to_dict()
118
  results["urban_rural_distribution"] = urban_rural
119
+
120
+ # City distribution
121
+ city_col = self._find_column(df, ['city', 'municipality', 'town'])
122
+ if city_col:
123
+ # Ensure we're working with string data
124
+ df[city_col] = df[city_col].astype(str)
125
+ city_counts = df[city_col].value_counts().head(5)
126
+ top_cities = city_counts.index.tolist()
127
+
128
+ # Breakdown by facility type for top cities
129
+ city_breakdown = {}
130
+ for city in top_cities:
131
+ city_data = df[df[city_col] == city]
132
+ if not city_data.empty and type_col in city_data.columns:
133
+ city_breakdown[city] = city_data[type_col].value_counts().to_dict()
134
+
135
+ results["top_cities"] = top_cities
136
+ results["city_breakdown"] = city_breakdown
137
+
138
+ # Total facilities count
139
+ results["total_facilities"] = len(df)
140
 
141
  return results
142
 
 
146
 
147
  for data_name in relevant_data:
148
  df = self.data_registry.get(data_name)
149
+ if df is None or df.empty:
150
  continue
151
 
152
  # Current capacity
153
+ capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
154
  if capacity_col:
155
+ # Ensure we're working with numeric data
156
+ df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
157
  total_capacity = df[capacity_col].sum()
158
  results["total_capacity"] = total_capacity
159
 
160
  # Capacity by facility type
161
  type_col = self._find_column(df, ['type', 'facility_type'])
162
+ if type_col and type_col in df.columns:
163
  capacity_by_type = df.groupby(type_col)[capacity_col].sum().to_dict()
164
  results["capacity_by_type"] = capacity_by_type
165
 
166
  # Capacity utilization
167
  utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
168
  if utilization_col:
169
+ # Ensure we're working with numeric data
170
+ df[utilization_col] = pd.to_numeric(df[utilization_col], errors='coerce')
171
  avg_utilization = df[utilization_col].mean()
172
  results["average_utilization"] = avg_utilization
173
 
174
  # Utilization by facility type
175
+ if type_col and type_col in df.columns:
176
  utilization_by_type = df.groupby(type_col)[utilization_col].mean().to_dict()
177
  results["utilization_by_type"] = utilization_by_type
178
 
 
181
  if len(time_cols) >= 2:
182
  trend_data = {}
183
  for col in time_cols:
184
+ # Ensure we're working with numeric data
185
+ df[col] = pd.to_numeric(df[col], errors='coerce')
186
  trend_data[col] = df[col].sum()
187
  results["capacity_trends"] = trend_data
188
 
 
190
  if len(time_cols) >= 2:
191
  latest = time_cols[-1]
192
  earliest = time_cols[0]
193
+ if trend_data[earliest] > 0: # Avoid division by zero
194
+ growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
195
+ results["capacity_growth_rate"] = growth_rate
196
+
197
+ # Bed change analysis
198
+ prev_col = self._find_column(df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
199
+ current_col = self._find_column(df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])
200
+
201
+ if prev_col and current_col:
202
+ # Ensure we're working with numeric data
203
+ df[prev_col] = pd.to_numeric(df[prev_col], errors='coerce')
204
+ df[current_col] = pd.to_numeric(df[current_col], errors='coerce')
205
+
206
+ # Calculate bed change
207
+ df['bed_change'] = df[current_col] - df[prev_col]
208
+
209
+ # Calculate percentage change
210
+ df['percent_change'] = df.apply(
211
+ lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
212
+ axis=1
213
+ )
214
+
215
+ # Zone-level analysis
216
+ zone_col = self._find_column(df, ['zone', 'region', 'area', 'district'])
217
+ if zone_col:
218
+ # Ensure we're working with string data
219
+ df[zone_col] = df[zone_col].astype(str)
220
+
221
+ zone_summary = df.groupby(zone_col).agg({
222
+ current_col: 'sum',
223
+ prev_col: 'sum',
224
+ 'bed_change': 'sum'
225
+ }).reset_index()
226
+
227
+ zone_summary['percent_change'] = zone_summary.apply(
228
+ lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
229
+ axis=1
230
+ )
231
+
232
+ results["zone_summary"] = zone_summary.to_dict('records')
233
+
234
+ # Find zones with largest changes
235
+ if not zone_summary.empty:
236
+ # Get zone with largest absolute decrease
237
+ if zone_summary['bed_change'].notna().any():
238
+ max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
239
+ max_abs_decrease = zone_summary.loc[max_abs_decrease_idx]
240
+ results["max_absolute_decrease"] = max_abs_decrease.to_dict()
241
+
242
+ # Get zone with largest percentage decrease
243
+ if zone_summary['percent_change'].notna().any():
244
+ max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
245
+ max_pct_decrease = zone_summary.loc[max_pct_decrease_idx]
246
+ results["max_percentage_decrease"] = max_pct_decrease.to_dict()
247
+
248
+ # Identify facilities with largest declines
249
+ facilities_decline = df.sort_values('bed_change').head(5)
250
+ if not facilities_decline.empty:
251
+ results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')
252
 
253
  return results
254
 
 
258
 
259
  for data_name in relevant_data:
260
  df = self.data_registry.get(data_name)
261
+ if df is None or df.empty:
262
  continue
263
 
264
  # Staff analysis
265
  staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
266
  if staff_col:
267
+ # Ensure we're working with numeric data
268
+ df[staff_col] = pd.to_numeric(df[staff_col], errors='coerce')
269
  total_staff = df[staff_col].sum()
270
  results["total_staff"] = total_staff
271
 
272
  # Staff per bed ratio
273
  capacity_col = self._find_column(df, ['capacity', 'beds'])
274
+ if capacity_col and capacity_col in df.columns:
275
+ # Ensure we're working with numeric data
276
+ df[capacity_col] = pd.to_numeric(df[capacity_col], errors='coerce')
277
+ df['staff_per_bed'] = df[staff_col] / df[capacity_col].replace(0, np.nan) # Avoid division by zero
278
  avg_staff_per_bed = df['staff_per_bed'].mean()
279
  results["staff_per_bed_ratio"] = avg_staff_per_bed
280
 
 
283
  if equipment_cols:
284
  equipment_summary = {}
285
  for col in equipment_cols:
286
+ # Ensure we're working with numeric data
287
+ df[col] = pd.to_numeric(df[col], errors='coerce')
288
  equipment_summary[col] = df[col].sum()
289
  results["equipment_summary"] = equipment_summary
290
 
 
296
 
297
  for data_name in relevant_data:
298
  df = self.data_registry.get(data_name)
299
+ if df is None or df.empty:
300
  continue
301
 
302
  # Find time-based columns
 
310
  prev_year = time_cols[i-1]
311
  curr_year = time_cols[i]
312
 
313
+ # Ensure we're working with numeric data
314
+ df[prev_year] = pd.to_numeric(df[prev_year], errors='coerce')
315
+ df[curr_year] = pd.to_numeric(df[curr_year], errors='coerce')
316
+
317
  prev_total = df[prev_year].sum()
318
  curr_total = df[curr_year].sum()
319
 
320
+ if prev_total > 0: # Avoid division by zero
321
  change_pct = (curr_total - prev_total) / prev_total * 100
322
  trends[f"{prev_year}_to_{curr_year}"] = {
323
  "absolute_change": curr_total - prev_total,
 
436
  # Helper methods
437
  def _find_column(self, df, patterns):
438
  """Find the first column matching any pattern"""
439
+ if df is None or df.empty:
440
+ return None
441
  for col in df.columns:
442
  if any(pattern.lower() in col.lower() for pattern in patterns):
443
  return col
 
445
 
446
  def _calculate_gini(self, values):
447
  """Calculate Gini coefficient for inequality measurement"""
448
+ if not values or len(values) < 2:
449
+ return 0
450
+
451
  values = sorted(values)
452
  n = len(values)
453
  index = np.arange(1, n + 1)
454
+ total = np.sum(values)
455
+
456
+ if total == 0:
457
+ return 0
458
+
459
+ gini = (np.sum((2 * index - n - 1) * values)) / (n * total)
460
  return gini
461
 
462
  def _calculate_diversity_index(self, distribution):
463
  """Calculate Shannon diversity index"""
464
+ if not distribution:
465
+ return 0
466
+
467
  total = sum(distribution.values())
468
  if total == 0:
469
  return 0
470
+
471
+ proportions = [count/total for count in distribution.values() if count > 0]
472
+ if not proportions:
473
+ return 0
474
+
475
+ return -sum(p * np.log(p) for p in proportions)
476
 
477
  def _extract_geographic_scope(self, text):
478
  """Extract geographic scope from text"""