Rajan Sharma committed on
Commit
379e195
·
verified ·
1 Parent(s): ef7ab85

Create healthcare_analysis.py

Browse files
Files changed (1) hide show
  1. healthcare_analysis.py +400 -0
healthcare_analysis.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # healthcare_analysis.py
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import Dict, List, Any, Optional, Tuple
5
+ import logging
6
+
7
+ logging.basicConfig(level=logging.INFO)
8
+ logger = logging.getLogger(__name__)
9
+
10
class HealthcareAnalyzer:
    """Keyword-driven analysis of healthcare facility datasets.

    The ``data_registry`` passed to the constructor must provide
    ``get(name)`` (returning a DataFrame, or ``None`` when the dataset is
    unknown) and ``find_related_datasets(keywords)`` (returning items that
    can be indexed with ``"name"``). Those are the only registry members
    this class uses.

    NOTE: every ``analyze_*`` method writes into one flat result dict, so
    when several datasets expose the same kind of column the dataset that
    is processed last overwrites the values of earlier ones.
    """

    # Year tokens looked for in column names to detect time-series columns.
    # Kept as a class attribute so callers can widen the window by
    # subclassing or by overriding it on an instance.
    TREND_YEARS = ("2020", "2021", "2022", "2023", "2024")

    def __init__(self, data_registry):
        """Store the registry and start with an empty result cache."""
        self.data_registry = data_registry
        # Holds the result dict of the most recent comprehensive_analysis().
        self.analysis_results = {}

    def comprehensive_analysis(self, scenario_text: str) -> Dict[str, Any]:
        """Run every analysis the scenario text asks for.

        Args:
            scenario_text: Free-text description of the scenario. Tasks and
                requirements are derived from it by keyword matching.

        Returns:
            A dict with one entry per detected task (``facility_distribution``,
            ``capacity_analysis``, ``resource_allocation``, ``trends``) plus
            the always-present ``recommendations`` and ``future_integration``
            entries.
        """
        logger.info("Starting comprehensive healthcare analysis")

        tasks = self._extract_tasks(scenario_text)
        requirements = self._extract_requirements(scenario_text)
        relevant_data = self._identify_relevant_data(scenario_text)

        # Ordered so that the result dict keeps a stable key order.
        analyses = (
            ("facility_distribution", self.analyze_facility_distribution),
            ("capacity_analysis", self.analyze_capacity),
            ("resource_allocation", self.analyze_resource_allocation),
            ("trends", self.analyze_trends),
        )

        results: Dict[str, Any] = {}
        for task_name, analyze in analyses:
            if task_name in tasks:
                results[task_name] = analyze(relevant_data)

        results["recommendations"] = self.generate_recommendations(results, requirements)
        results["future_integration"] = self.identify_integration_opportunities(results)

        # Keep the latest results reachable from the instance; the attribute
        # was initialised in __init__ but previously never filled in.
        self.analysis_results = results

        logger.info("Comprehensive analysis completed")
        return results

    def _extract_tasks(self, scenario_text: str) -> List[str]:
        """Return the task names whose keywords occur in ``scenario_text``.

        Matching is a case-insensitive substring test, so e.g. "trends"
        matches the keyword "trend".
        """
        task_keywords = {
            "facility_distribution": ["facility", "distribution", "location", "sites"],
            "capacity_analysis": ["capacity", "beds", "occupancy", "utilization"],
            "resource_allocation": ["resource", "allocation", "staffing", "equipment"],
            "trends": ["trend", "change", "growth", "decline", "pattern"]
        }

        lowered = scenario_text.lower()  # lower-case once, not per keyword
        return [
            task_type
            for task_type, keywords in task_keywords.items()
            if any(kw in lowered for kw in keywords)
        ]

    def _extract_requirements(self, scenario_text: str) -> Dict[str, Any]:
        """Collect scope, period, facility types and metrics from the text."""
        return {
            "geographic_scope": self._extract_geographic_scope(scenario_text),
            "time_period": self._extract_time_period(scenario_text),
            "facility_types": self._extract_facility_types(scenario_text),
            "metrics_needed": self._extract_metrics(scenario_text)
        }

    def analyze_facility_distribution(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise how facilities are spread by region, type and setting.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry does not know are skipped.

        Returns:
            A dict that may contain ``geographic_distribution``,
            ``geographic_inequality`` (Gini coefficient of the regional
            counts), ``facility_type_distribution``, ``facility_diversity``
            (Shannon index) and ``urban_rural_distribution``.
        """
        results = {}

        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Geographic distribution and its inequality.
            geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
            if geo_col:
                geo_dist = df[geo_col].value_counts().to_dict()
                results["geographic_distribution"] = geo_dist
                results["geographic_inequality"] = self._calculate_gini(list(geo_dist.values()))

            # Facility type distribution and its diversity.
            type_col = self._find_column(df, ['type', 'category', 'facility_type'])
            if type_col:
                type_dist = df[type_col].value_counts().to_dict()
                results["facility_type_distribution"] = type_dist
                results["facility_diversity"] = self._calculate_diversity_index(type_dist)

            # Urban vs rural distribution.
            urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
            if urban_col:
                results["urban_rural_distribution"] = df[urban_col].value_counts().to_dict()

        return results

    def analyze_capacity(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise capacity, utilization and capacity trends.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry does not know are skipped.

        Returns:
            A dict that may contain ``total_capacity``, ``capacity_by_type``,
            ``average_utilization``, ``utilization_by_type``,
            ``capacity_trends`` and ``capacity_growth_rate`` (percent change
            from the earliest to the latest year column). The growth rate is
            omitted when the earliest total is zero, because the ratio is
            undefined.
        """
        results = {}

        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Resolved up front: both the capacity and the utilization
            # breakdown need it, and it must never leak from a previous
            # dataset or be undefined when there is no capacity column.
            type_col = self._find_column(df, ['type', 'facility_type'])

            # Current capacity.
            capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
            if capacity_col:
                results["total_capacity"] = df[capacity_col].sum()
                if type_col:
                    results["capacity_by_type"] = df.groupby(type_col)[capacity_col].sum().to_dict()

            # Capacity utilization.
            utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
            if utilization_col:
                results["average_utilization"] = df[utilization_col].mean()
                if type_col:
                    results["utilization_by_type"] = df.groupby(type_col)[utilization_col].mean().to_dict()

            # Capacity trends across year-tagged columns.
            time_cols = self._find_year_columns(df)
            if len(time_cols) >= 2:
                trend_data = {col: df[col].sum() for col in time_cols}
                results["capacity_trends"] = trend_data

                baseline = trend_data[time_cols[0]]
                if baseline != 0:  # growth relative to zero is undefined
                    latest_total = trend_data[time_cols[-1]]
                    results["capacity_growth_rate"] = (latest_total - baseline) / baseline * 100

        return results

    def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise staffing and equipment figures.

        The registry's DataFrames are treated as read-only: the staff-per-bed
        ratio is computed on a temporary Series instead of being written back
        as a new column.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry does not know are skipped.

        Returns:
            A dict that may contain ``total_staff``, ``staff_per_bed_ratio``
            (mean of the per-row ratios, ignoring rows with zero capacity)
            and ``equipment_summary``.
        """
        results = {}

        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Staff analysis.
            staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
            if staff_col:
                results["total_staff"] = df[staff_col].sum()

                capacity_col = self._find_column(df, ['capacity', 'beds'])
                if capacity_col:
                    beds = df[capacity_col]
                    # Rows with zero beds would produce inf and poison the mean.
                    has_beds = beds != 0
                    if has_beds.any():
                        ratio = df.loc[has_beds, staff_col] / beds[has_beds]
                        results["staff_per_bed_ratio"] = ratio.mean()

            # Equipment analysis.
            equipment_cols = [col for col in df.columns if 'equipment' in str(col).lower()]
            if equipment_cols:
                results["equipment_summary"] = {col: df[col].sum() for col in equipment_cols}

        return results

    def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Compute changes between consecutive year-tagged columns.

        Args:
            relevant_data: Dataset names to look up in the registry; names
                the registry does not know are skipped.

        Returns:
            A dict that may contain ``year_over_year_trends``, mapping
            ``"<prev>_to_<curr>"`` to the absolute and percentage change of
            the column totals. Pairs whose earlier total is not positive are
            left out.
        """
        results = {}

        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            time_cols = self._find_year_columns(df)
            if len(time_cols) < 2:
                continue

            trends = {}
            for prev_year, curr_year in zip(time_cols, time_cols[1:]):
                prev_total = df[prev_year].sum()
                curr_total = df[curr_year].sum()

                if prev_total > 0:
                    change_pct = (curr_total - prev_total) / prev_total * 100
                    trends[f"{prev_year}_to_{curr_year}"] = {
                        "absolute_change": curr_total - prev_total,
                        "percentage_change": change_pct
                    }

            results["year_over_year_trends"] = trends

        return results

    def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
        """Turn analysis results into prioritised recommendations.

        Args:
            analysis_results: Output of the ``analyze_*`` methods keyed by
                task name.
            requirements: Extracted requirements; currently not consulted.

        Returns:
            Recommendation dicts (``title``, ``description``, ``priority``,
            ``data_source``) sorted High, Medium, Low.
        """
        recommendations = []

        # Capacity-related recommendations.
        capacity = analysis_results.get("capacity_analysis")
        if capacity is not None:
            # NOTE(review): the 0.7 threshold and the ``:.1%`` format assume
            # utilization is stored as a fraction (0-1), not a percentage -
            # confirm against the datasets.
            if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
                recommendations.append({
                    "title": "Optimize Underutilized Capacity",
                    "description": f"Average utilization is {capacity['average_utilization']:.1%}. Consider repurposing underutilized facilities or consolidating services.",
                    "priority": "Medium",
                    "data_source": "Capacity utilization analysis"
                })

            if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
                recommendations.append({
                    "title": "Expand Capacity Strategically",
                    "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}%. Invest in new facilities or expand existing ones to meet demand.",
                    "priority": "High",
                    "data_source": "Capacity trend analysis"
                })

        # Geographic distribution recommendations.
        dist = analysis_results.get("facility_distribution")
        if dist is not None:
            if "geographic_inequality" in dist and dist["geographic_inequality"] > 0.4:
                recommendations.append({
                    "title": "Address Geographic Inequity",
                    "description": f"High geographic inequality (Gini: {dist['geographic_inequality']:.2f}). Consider targeted investments in underserved areas.",
                    "priority": "High",
                    "data_source": "Geographic distribution analysis"
                })

        # Resource allocation recommendations.
        resources = analysis_results.get("resource_allocation")
        if resources is not None:
            if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
                recommendations.append({
                    "title": "Increase Staffing Levels",
                    "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f}, which may be insufficient. Consider hiring additional staff.",
                    "priority": "High",
                    "data_source": "Resource allocation analysis"
                })

        # list.sort is stable, so equal priorities keep their insertion order.
        priority_order = {"High": 0, "Medium": 1, "Low": 2}
        recommendations.sort(key=lambda rec: priority_order.get(rec["priority"], 3))

        return recommendations

    def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
        """Return the catalogue of data/AI integration opportunities.

        The catalogue is static; ``analysis_results`` is accepted for
        interface compatibility but is not consulted.
        """
        return {
            "data_integration": [
                {
                    "opportunity": "Integrate real-time occupancy data",
                    "description": "Combine current facility data with real-time occupancy monitoring systems",
                    "benefit": "Enable dynamic resource allocation and surge planning"
                },
                {
                    "opportunity": "Incorporate demographic data",
                    "description": "Add population demographics and health needs data",
                    "benefit": "Improve demand forecasting and service planning"
                },
            ],
            "ai_applications": [
                {
                    "opportunity": "Predictive capacity modeling",
                    "description": "Use ML to forecast capacity needs based on trends and external factors",
                    "benefit": "Proactive resource planning and reduced wait times"
                },
                {
                    "opportunity": "Optimization algorithms",
                    "description": "Implement AI for staff scheduling and resource allocation",
                    "benefit": "Improved efficiency and reduced operational costs"
                },
            ],
            "enhanced_metrics": [
                {
                    "metric": "Patient flow efficiency",
                    "description": "Measure time from admission to discharge across facilities",
                    "benefit": "Identify bottlenecks and improve patient experience"
                },
                {
                    "metric": "Resource utilization index",
                    "description": "Composite metric combining staff, equipment, and space utilization",
                    "benefit": "Holistic view of operational efficiency"
                },
            ],
        }

    # Helper methods
    def _find_column(self, df, patterns):
        """Return the first column whose name contains any pattern, else None.

        Matching is a case-insensitive substring test. Column labels are
        converted with ``str`` first so non-string labels (e.g. integer
        years) do not raise.
        """
        lowered_patterns = [pattern.lower() for pattern in patterns]
        for col in df.columns:
            name = str(col).lower()
            if any(pattern in name for pattern in lowered_patterns):
                return col
        return None

    def _find_year_columns(self, df):
        """Return numeric columns tagged with a year from ``TREND_YEARS``.

        Columns are ordered chronologically by the year token found in their
        name (ties keep their DataFrame order), so the first and last entries
        really are the earliest and latest periods. Non-numeric columns are
        skipped because their totals cannot be compared.
        """
        tagged = []
        for col in df.columns:
            name = str(col).lower()
            year = next((y for y in self.TREND_YEARS if y in name), None)
            if year is not None and pd.api.types.is_numeric_dtype(df[col]):
                tagged.append((year, col))
        tagged.sort(key=lambda pair: pair[0])  # stable: ties keep column order
        return [col for _, col in tagged]

    def _calculate_gini(self, values):
        """Return the Gini coefficient of ``values`` (0 = perfectly equal).

        An empty input or an all-zero input has no inequality to measure and
        yields ``0.0`` instead of dividing by zero.
        """
        ordered = np.sort(np.asarray(values, dtype=float))
        n = ordered.size
        total = ordered.sum()
        if n == 0 or total == 0:
            return 0.0
        rank = np.arange(1, n + 1)
        return float(np.sum((2 * rank - n - 1) * ordered) / (n * total))

    def _calculate_diversity_index(self, distribution):
        """Return the Shannon diversity index (natural log) of the counts."""
        total = sum(distribution.values())
        if total == 0:
            return 0
        proportions = [count / total for count in distribution.values()]
        entropy = -sum(p * np.log(p) for p in proportions if p > 0)
        # Entropy is never negative; clamping also turns the ``-0.0`` that a
        # single category produces into a plain ``0.0``.
        return max(0.0, float(entropy))

    def _extract_geographic_scope(self, text):
        """Return "Alberta", "Canada" or "Unknown" based on keywords in text."""
        lowered = text.lower()
        if "alberta" in lowered:
            return "Alberta"
        if "canada" in lowered:
            return "Canada"
        return "Unknown"

    def _extract_time_period(self, text):
        """Return "<first>-<last>" for the 20xx years mentioned in the text.

        At least two year mentions are required; otherwise "Unknown" is
        returned.
        """
        import re

        years = re.findall(r'\b(20\d{2})\b', text)
        if len(years) >= 2:
            # Four-digit strings compare the same way as the numbers do.
            return f"{min(years)}-{max(years)}"
        return "Unknown"

    def _extract_facility_types(self, text):
        """Return the facility types mentioned in the text."""
        lowered = text.lower()
        types = []
        if "hospital" in lowered:
            types.append("Hospitals")
        if "nursing" in lowered or "long-term" in lowered:
            types.append("Nursing homes")
        if "clinic" in lowered:
            types.append("Clinics")
        return types

    def _extract_metrics(self, text):
        """Return the metrics the text asks for."""
        lowered = text.lower()
        metrics = []
        if "bed" in lowered:
            metrics.append("Bed capacity")
        if "occupancy" in lowered:
            metrics.append("Occupancy rates")
        if "staff" in lowered:
            metrics.append("Staffing levels")
        return metrics

    def _identify_relevant_data(self, text):
        """Return the names of datasets the registry relates to healthcare.

        The lookup uses a fixed keyword list; ``text`` is currently not used
        to narrow the search.
        """
        keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
        return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]