Spaces:
Sleeping
Sleeping
Rajan Sharma
committed on
Update healthcare_analysis.py
Browse files- healthcare_analysis.py +8 -927
healthcare_analysis.py
CHANGED
|
@@ -1,932 +1,13 @@
|
|
| 1 |
# healthcare_analysis.py
|
| 2 |
import pandas as pd
|
| 3 |
-
|
| 4 |
-
from typing import Dict, List, Any, Optional, Tuple
|
| 5 |
-
import logging
|
| 6 |
-
import re
|
| 7 |
-
|
| 8 |
-
logging.basicConfig(level=logging.INFO)
|
| 9 |
-
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
class HealthcareAnalyzer:
|
| 12 |
-
def __init__(self,
|
| 13 |
-
self.
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
logger.info("Starting comprehensive healthcare analysis")
|
| 20 |
-
|
| 21 |
-
self.scenario_text = scenario_text
|
| 22 |
-
|
| 23 |
-
# Extract all requirements and tasks
|
| 24 |
-
requirements = self._extract_all_requirements(scenario_text)
|
| 25 |
-
tasks = self._extract_detailed_tasks(scenario_text)
|
| 26 |
-
|
| 27 |
-
# Identify relevant datasets
|
| 28 |
-
relevant_data = self._identify_relevant_data(scenario_text)
|
| 29 |
-
|
| 30 |
-
# Perform all analyses based on tasks
|
| 31 |
-
results = {
|
| 32 |
-
"requirements": requirements,
|
| 33 |
-
"tasks_completed": [],
|
| 34 |
-
"data_sources": relevant_data
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
-
# Data Preparation Tasks
|
| 38 |
-
if "data_preparation" in tasks:
|
| 39 |
-
results["data_preparation"] = self.analyze_data_preparation(relevant_data, requirements)
|
| 40 |
-
results["tasks_completed"].append("data_preparation")
|
| 41 |
-
|
| 42 |
-
# Facility Distribution Analysis
|
| 43 |
-
if "facility_distribution" in tasks:
|
| 44 |
-
results["facility_distribution"] = self.analyze_facility_distribution(relevant_data, requirements)
|
| 45 |
-
results["tasks_completed"].append("facility_distribution")
|
| 46 |
-
|
| 47 |
-
# Capacity Analysis
|
| 48 |
-
if "capacity_analysis" in tasks:
|
| 49 |
-
results["capacity_analysis"] = self.analyze_capacity(relevant_data, requirements)
|
| 50 |
-
results["tasks_completed"].append("capacity_analysis")
|
| 51 |
-
|
| 52 |
-
# Long-Term Care Assessment (specific to scenario requirements)
|
| 53 |
-
if "long_term_care_assessment" in tasks:
|
| 54 |
-
results["long_term_care_assessment"] = self.analyze_long_term_care_capacity(results, requirements)
|
| 55 |
-
results["tasks_completed"].append("long_term_care_assessment")
|
| 56 |
-
|
| 57 |
-
# Resource Allocation Analysis
|
| 58 |
-
if "resource_allocation" in tasks:
|
| 59 |
-
results["resource_allocation"] = self.analyze_resource_allocation(relevant_data)
|
| 60 |
-
results["tasks_completed"].append("resource_allocation")
|
| 61 |
-
|
| 62 |
-
# Trends Analysis
|
| 63 |
-
if "trends" in tasks:
|
| 64 |
-
results["trends"] = self.analyze_trends(relevant_data)
|
| 65 |
-
results["tasks_completed"].append("trends")
|
| 66 |
-
|
| 67 |
-
# Generate recommendations
|
| 68 |
-
if "operational_recommendations" in tasks:
|
| 69 |
-
results["recommendations"] = self.generate_operational_recommendations(results, requirements)
|
| 70 |
-
results["tasks_completed"].append("operational_recommendations")
|
| 71 |
-
|
| 72 |
-
# Future Integration Opportunities
|
| 73 |
-
if "future_integration" in tasks:
|
| 74 |
-
results["future_integration"] = self.identify_integration_opportunities(results)
|
| 75 |
-
results["tasks_completed"].append("future_integration")
|
| 76 |
-
|
| 77 |
-
# Validate that all required tasks were completed
|
| 78 |
-
validation_result = self.validate_analysis_completeness(tasks, results["tasks_completed"])
|
| 79 |
-
results["validation"] = validation_result
|
| 80 |
-
|
| 81 |
-
logger.info("Comprehensive analysis completed")
|
| 82 |
-
return results
|
| 83 |
-
|
| 84 |
-
def _extract_all_requirements(self, scenario_text: str) -> Dict[str, Any]:
    """Build the requirements dictionary for a scenario.

    Each entry is produced by handing ``scenario_text`` to the matching
    ``_extract_*`` helper on this object; the helpers run in the order
    listed below.

    Args:
        scenario_text: Free-text scenario description to parse.

    Returns:
        Dict with the keys ``geographic_scope``, ``time_period``,
        ``facility_types``, ``metrics_needed``, ``regions``,
        ``data_files`` and ``specific_questions``.
    """
    extractors = (
        ("geographic_scope", self._extract_geographic_scope),
        ("time_period", self._extract_time_period),
        ("facility_types", self._extract_facility_types),
        ("metrics_needed", self._extract_metrics),
        ("regions", self._extract_regions),
        ("data_files", self._extract_data_files),
        ("specific_questions", self._extract_specific_questions),
    )
    return {key: extract(scenario_text) for key, extract in extractors}
|
| 96 |
-
|
| 97 |
-
def _extract_detailed_tasks(self, scenario_text: str) -> List[str]:
|
| 98 |
-
"""Extract detailed tasks from scenario text"""
|
| 99 |
-
tasks = []
|
| 100 |
-
text_lower = scenario_text.lower()
|
| 101 |
-
|
| 102 |
-
# Data preparation tasks
|
| 103 |
-
if any(phrase in text_lower for phrase in ["load the data", "data preparation", "frequency table"]):
|
| 104 |
-
tasks.append("data_preparation")
|
| 105 |
-
|
| 106 |
-
# Facility distribution tasks
|
| 107 |
-
if any(phrase in text_lower for phrase in ["facility distribution", "cities with highest", "facility type"]):
|
| 108 |
-
tasks.append("facility_distribution")
|
| 109 |
-
|
| 110 |
-
# Capacity analysis tasks
|
| 111 |
-
if any(phrase in text_lower for phrase in ["bed capacity", "capacity analysis", "bed_change"]):
|
| 112 |
-
tasks.append("capacity_analysis")
|
| 113 |
-
|
| 114 |
-
# Long-term care assessment tasks
|
| 115 |
-
if any(phrase in text_lower for phrase in ["long-term care", "long term care", "nursing care"]):
|
| 116 |
-
tasks.append("long_term_care_assessment")
|
| 117 |
-
|
| 118 |
-
# Resource allocation tasks
|
| 119 |
-
if any(phrase in text_lower for phrase in ["resource allocation", "staffing", "equipment"]):
|
| 120 |
-
tasks.append("resource_allocation")
|
| 121 |
-
|
| 122 |
-
# Trends analysis tasks
|
| 123 |
-
if any(phrase in text_lower for phrase in ["trends", "change", "growth", "decline"]):
|
| 124 |
-
tasks.append("trends")
|
| 125 |
-
|
| 126 |
-
# Operational recommendations tasks
|
| 127 |
-
if any(phrase in text_lower for phrase in ["operational recommendations", "recommend actions", "mitigate shortages"]):
|
| 128 |
-
tasks.append("operational_recommendations")
|
| 129 |
-
|
| 130 |
-
# Future integration tasks
|
| 131 |
-
if any(phrase in text_lower for phrase in ["future integration", "augmented ai", "decision-making"]):
|
| 132 |
-
tasks.append("future_integration")
|
| 133 |
-
|
| 134 |
-
return tasks
|
| 135 |
-
|
| 136 |
-
def _extract_specific_questions(self, scenario_text: str) -> List[str]:
|
| 137 |
-
"""Extract specific questions from scenario text"""
|
| 138 |
-
questions = []
|
| 139 |
-
|
| 140 |
-
# Look for question patterns
|
| 141 |
-
question_patterns = [
|
| 142 |
-
r'which zone shows the largest',
|
| 143 |
-
r'which zone has the largest',
|
| 144 |
-
r'list the five',
|
| 145 |
-
r'does this city have',
|
| 146 |
-
r'provide the numbers to justify',
|
| 147 |
-
r'propose at least',
|
| 148 |
-
r'mention at least'
|
| 149 |
-
]
|
| 150 |
-
|
| 151 |
-
for pattern in question_patterns:
|
| 152 |
-
matches = re.findall(pattern, scenario_text, re.IGNORECASE)
|
| 153 |
-
questions.extend(matches)
|
| 154 |
-
|
| 155 |
-
return questions
|
| 156 |
-
|
| 157 |
-
def _extract_data_files(self, scenario_text: str) -> List[str]:
|
| 158 |
-
"""Extract data file names from scenario text"""
|
| 159 |
-
files = []
|
| 160 |
-
|
| 161 |
-
# Look for file patterns
|
| 162 |
-
file_patterns = [
|
| 163 |
-
r'([a-zA-Z_]+\.csv)',
|
| 164 |
-
r'([a-zA-Z_]+\.xlsx)',
|
| 165 |
-
r'([a-zA-Z_]+\.json)'
|
| 166 |
-
]
|
| 167 |
-
|
| 168 |
-
for pattern in file_patterns:
|
| 169 |
-
matches = re.findall(pattern, scenario_text)
|
| 170 |
-
files.extend(matches)
|
| 171 |
-
|
| 172 |
-
return list(set(files)) # Remove duplicates
|
| 173 |
-
|
| 174 |
-
def analyze_data_preparation(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
    """Summarise facility types and top cities for each relevant dataset.

    Every named dataset is looked up in ``self.data_registry``, narrowed with
    ``self._filter_by_geography`` and then summarised. Results from later
    datasets overwrite the same keys written by earlier ones.

    Args:
        relevant_data: Names of datasets to look up in the registry.
        requirements: Parsed requirements; ``geographic_scope`` and
            ``regions`` are read (defaults ``"Unknown"`` and ``[]``).

    Returns:
        Dict that may contain ``facility_type_frequency``, ``top_cities``,
        ``city_facility_breakdown`` and ``total_facilities``.
    """
    summary = {}
    scope = requirements.get("geographic_scope", "Unknown")
    region_names = requirements.get("regions", [])

    for dataset in relevant_data:
        frame = self.data_registry.get(dataset)
        if frame is None or frame.empty:
            continue

        scoped = self._filter_by_geography(frame, scope, region_names)
        if scoped.empty:
            continue

        # Frequency table of facility types (values compared as strings).
        kind_col = self._find_column(scoped, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
        if kind_col:
            scoped[kind_col] = scoped[kind_col].astype(str)
            summary["facility_type_frequency"] = scoped[kind_col].value_counts().to_dict()

        # Five most frequent cities, each with its own facility-type counts.
        place_col = self._find_column(scoped, ['city', 'municipality', 'town'])
        if place_col:
            scoped[place_col] = scoped[place_col].astype(str)
            leaders = scoped[place_col].value_counts().head(5).index.tolist()

            per_city = {}
            for place in leaders:
                rows = scoped[scoped[place_col] == place]
                if rows.empty or kind_col not in rows.columns:
                    continue
                per_city[place] = rows[kind_col].value_counts().to_dict()

            summary["top_cities"] = leaders
            summary["city_facility_breakdown"] = per_city

        summary["total_facilities"] = len(scoped)

    return summary
|
| 219 |
-
|
| 220 |
-
def analyze_long_term_care_capacity(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> Dict[str, Any]:
    """Assess long-term care capacity for the zone with the steepest bed decline.

    The zone is taken from ``analysis_results["capacity_analysis"]``
    (``max_percentage_decrease`` entry); the city is chosen by
    ``self._find_major_city_in_zone``; facility counts for that city come
    from the per-city breakdown in ``analysis_results["facility_distribution"]``.

    The per-city breakdown is read from ``city_facility_breakdown`` and,
    when that key is absent, from ``city_breakdown``. The latter is the key
    ``analyze_facility_distribution`` actually writes, so without the
    fallback the counts and assessment would never be produced.

    Args:
        analysis_results: Results accumulated by the earlier analysis steps.
        requirements: Parsed requirements, forwarded to the city lookup.

    Returns:
        Dict that may contain ``zone_with_largest_decrease``, ``major_city``,
        ``facility_counts``, ``nursing_to_hospital_ratio`` and
        ``capacity_assessment``. Empty when no zone could be determined.
    """
    results = {}

    if "capacity_analysis" not in analysis_results:
        return results

    capacity_data = analysis_results["capacity_analysis"]
    max_pct_decrease = capacity_data.get("max_percentage_decrease", {})

    # The zone column keeps whatever name the source data used, so try the
    # common spellings.
    zone_name = None
    for key in ("zone", "Zone", "ZONE", "region", "Region", "REGION"):
        if key in max_pct_decrease:
            zone_name = max_pct_decrease[key]
            break

    if not zone_name:
        return results
    results["zone_with_largest_decrease"] = zone_name

    if "facility_distribution" not in analysis_results:
        return results
    facility_data = analysis_results["facility_distribution"]

    major_city = self._find_major_city_in_zone(zone_name, facility_data, requirements)
    if not major_city:
        return results
    results["major_city"] = major_city

    city_breakdown = (
        facility_data.get("city_facility_breakdown")
        or facility_data.get("city_breakdown")
        or {}
    )
    if major_city not in city_breakdown:
        return results

    facilities_in_city = city_breakdown[major_city]
    hospitals = facilities_in_city.get("Hospitals", 0)
    nursing_care = facilities_in_city.get("Nursing and residential care facilities", 0)
    ambulatory = facilities_in_city.get("Ambulatory health care services", 0)

    results["facility_counts"] = {
        "hospitals": hospitals,
        "nursing_residential_care": nursing_care,
        "ambulatory": ambulatory,
    }

    if hospitals > 0:
        ratio = nursing_care / hospitals
        results["nursing_to_hospital_ratio"] = ratio
        # A ratio of at least 1.5 nursing/residential facilities per
        # hospital is treated as sufficient.
        results["capacity_assessment"] = "sufficient" if ratio >= 1.5 else "insufficient"
    else:
        results["capacity_assessment"] = "insufficient (no hospitals)"

    return results
|
| 282 |
-
|
| 283 |
-
def _find_major_city_in_zone(self, zone_name: str, facility_data: Dict[str, Any], requirements: Dict[str, Any]) -> Optional[str]:
|
| 284 |
-
"""Find the major city in a given zone"""
|
| 285 |
-
# This is a simplified approach - in a real implementation, you would need
|
| 286 |
-
# zone-to-city mapping data or more sophisticated geospatial analysis
|
| 287 |
-
|
| 288 |
-
# For now, we'll use the city with the most facilities as the major city
|
| 289 |
-
top_cities = facility_data.get("top_cities", [])
|
| 290 |
-
|
| 291 |
-
if top_cities:
|
| 292 |
-
# In a real implementation, you would check which city belongs to the zone
|
| 293 |
-
# For now, we'll return the first city as a placeholder
|
| 294 |
-
return top_cities[0]
|
| 295 |
-
|
| 296 |
-
return None
|
| 297 |
-
|
| 298 |
-
def generate_operational_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
    """Turn analysis results into a prioritised list of recommendations.

    Data-driven recommendations are derived from the ``capacity_analysis``,
    ``long_term_care_assessment`` and ``resource_allocation`` entries of
    ``analysis_results`` when present. If fewer than three result, distinct
    generic recommendations are appended until there are three.

    The zone-decline recommendation is only emitted when the recorded
    ``percent_change`` is negative, and the figure is reported as a positive
    magnitude (e.g. "12.5% decrease"), never as "-12.5% decrease".

    Args:
        analysis_results: Results accumulated by the earlier analysis steps.
        requirements: Parsed requirements; ``geographic_scope`` is read
            (default ``"the region"``).

    Returns:
        List of dicts with ``title``, ``description``, ``priority`` and
        ``data_source``, sorted High, Medium, Low (stable within a level).
    """
    recommendations = []
    geographic_scope = requirements.get("geographic_scope", "the region")

    # Capacity-related recommendations
    if "capacity_analysis" in analysis_results:
        capacity = analysis_results["capacity_analysis"]

        if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
            recommendations.append({
                "title": "Optimize Underutilized Capacity",
                "description": f"Average utilization is {capacity['average_utilization']:.1%} in {geographic_scope}. Consider repurposing underutilized facilities or consolidating services.",
                "priority": "Medium",
                "data_source": "Capacity utilization analysis"
            })

        if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
            recommendations.append({
                "title": "Expand Capacity Strategically",
                "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}% in {geographic_scope}. Invest in new facilities or expand existing ones to meet demand.",
                "priority": "High",
                "data_source": "Capacity trend analysis"
            })

        # Zone-specific recommendation
        steepest = capacity.get("max_percentage_decrease")
        if isinstance(steepest, dict):
            zone_name = "a zone"
            for key in ("zone", "Zone", "ZONE", "region", "Region", "REGION"):
                if key in steepest:
                    zone_name = steepest[key]
                    break

            # `or 0` maps a missing/None value to 0; NaN compares False below.
            change = steepest.get("percent_change") or 0

            # Only a negative change is a decline; report its magnitude.
            if zone_name and change < 0:
                recommendations.append({
                    "title": f"Address Capacity Decline in {zone_name}",
                    "description": f"{zone_name} shows a {abs(change):.1f}% decrease in bed capacity. Investigate causes and implement recovery strategies.",
                    "priority": "High",
                    "data_source": "Zone capacity analysis"
                })

    # Long-term care recommendations
    if "long_term_care_assessment" in analysis_results:
        ltc_data = analysis_results["long_term_care_assessment"]

        if ltc_data.get("capacity_assessment") == "insufficient":
            major_city = ltc_data.get("major_city", "the major city")
            ratio = ltc_data.get("nursing_to_hospital_ratio", 0)

            recommendations.append({
                "title": f"Expand Long-Term Care Capacity in {major_city}",
                "description": f"Nursing/residential care to hospital ratio is {ratio:.2f} in {major_city}, which is insufficient. Invest in new long-term care beds or repurpose existing facilities.",
                "priority": "High",
                "data_source": "Long-term care capacity assessment"
            })

    # Resource allocation recommendations
    if "resource_allocation" in analysis_results:
        resources = analysis_results["resource_allocation"]

        if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
            recommendations.append({
                "title": "Increase Staffing Levels",
                "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f} in {geographic_scope}, which may be insufficient. Consider hiring additional staff.",
                "priority": "High",
                "data_source": "Resource allocation analysis"
            })

    # Guarantee at least three recommendations, using distinct generic
    # entries rather than repeating the same one.
    generic_pool = (
        {
            "title": "Implement Comprehensive Capacity Management",
            "description": "Develop a comprehensive capacity management system that includes real-time monitoring, predictive analytics, and dynamic resource allocation.",
            "priority": "Medium",
            "data_source": "General best practices"
        },
        {
            "title": "Establish Routine Capacity Monitoring",
            "description": "Track bed capacity, utilization, and staffing indicators on a regular schedule so that emerging shortages are detected early.",
            "priority": "Medium",
            "data_source": "General best practices"
        },
        {
            "title": "Improve Data Quality and Coverage",
            "description": "Standardize how facility, capacity, and staffing data are collected so that future analyses can support more specific recommendations.",
            "priority": "Low",
            "data_source": "General best practices"
        },
    )
    for generic in generic_pool:
        if len(recommendations) >= 3:
            break
        recommendations.append(dict(generic))

    # list.sort is stable, so insertion order is kept within a priority level.
    priority_order = {"High": 0, "Medium": 1, "Low": 2}
    recommendations.sort(key=lambda rec: priority_order.get(rec["priority"], 3))

    return recommendations
|
| 384 |
-
|
| 385 |
-
def validate_analysis_completeness(self, required_tasks: List[str], completed_tasks: List[str]) -> Dict[str, Any]:
    """Report which required tasks were not completed.

    Args:
        required_tasks: Task names that were expected to run.
        completed_tasks: Task names that actually ran.

    Returns:
        Dict with ``all_tasks_completed`` (True when nothing is missing),
        ``missing_tasks`` (required tasks absent from ``completed_tasks``,
        in required order) and ``completion_rate``
        (``len(completed_tasks) / len(required_tasks)``, or ``0`` when
        ``required_tasks`` is empty).
    """
    outstanding = [task for task in required_tasks if task not in completed_tasks]

    if required_tasks:
        rate = len(completed_tasks) / len(required_tasks)
    else:
        rate = 0

    return {
        "all_tasks_completed": not outstanding,
        "missing_tasks": outstanding,
        "completion_rate": rate,
    }
|
| 399 |
-
|
| 400 |
-
def analyze_facility_distribution(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
    """Describe how facilities are spread across types, areas and cities.

    Every named dataset is looked up in ``self.data_registry``, narrowed with
    ``self._filter_by_geography`` and then summarised. Results from later
    datasets overwrite the same keys written by earlier ones.

    Args:
        relevant_data: Names of datasets to look up in the registry.
        requirements: Parsed requirements; ``geographic_scope`` and
            ``regions`` are read (defaults ``"Unknown"`` and ``[]``).

    Returns:
        Dict that may contain ``facility_type_distribution``,
        ``facility_diversity``, ``geographic_distribution``,
        ``geographic_inequality``, ``top_cities``, ``city_breakdown`` and
        ``total_facilities``.
    """
    findings = {}
    scope = requirements.get("geographic_scope", "Unknown")
    region_names = requirements.get("regions", [])

    for dataset in relevant_data:
        frame = self.data_registry.get(dataset)
        if frame is None or frame.empty:
            continue

        scoped = self._filter_by_geography(frame, scope, region_names)
        if scoped.empty:
            continue

        # Counts per facility type, plus a diversity score over those counts.
        kind_col = self._find_column(scoped, ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'])
        if kind_col:
            scoped[kind_col] = scoped[kind_col].astype(str)
            kind_counts = scoped[kind_col].value_counts().to_dict()
            findings["facility_type_distribution"] = kind_counts
            findings["facility_diversity"] = self._calculate_diversity_index(kind_counts)

        # Counts per geographic unit, plus a Gini coefficient over those counts.
        area_col = self._find_column(scoped, ['province', 'state', 'region', 'zone', 'area'])
        if area_col:
            scoped[area_col] = scoped[area_col].astype(str)
            area_counts = scoped[area_col].value_counts().to_dict()
            findings["geographic_distribution"] = area_counts
            findings["geographic_inequality"] = self._calculate_gini(list(area_counts.values()))

        # Five most frequent cities, each with its own facility-type counts.
        place_col = self._find_column(scoped, ['city', 'municipality', 'town'])
        if place_col:
            scoped[place_col] = scoped[place_col].astype(str)
            leaders = scoped[place_col].value_counts().head(5).index.tolist()

            per_city = {}
            for place in leaders:
                rows = scoped[scoped[place_col] == place]
                if rows.empty or kind_col not in rows.columns:
                    continue
                per_city[place] = rows[kind_col].value_counts().to_dict()

            findings["top_cities"] = leaders
            findings["city_breakdown"] = per_city

        findings["total_facilities"] = len(scoped)

    return findings
|
| 463 |
-
|
| 464 |
-
def analyze_capacity(self, relevant_data: List[str], requirements: Dict[str, Any]) -> Dict[str, Any]:
    """Analyse bed capacity, utilization, yearly trends and bed changes.

    Every named dataset is looked up in ``self.data_registry`` and narrowed
    with ``self._filter_by_geography``; the filtered frame is then modified
    locally (numeric coercion, ``bed_change`` / ``percent_change`` columns).
    Results from later datasets overwrite the same keys written by earlier
    ones.

    The facility-type column is resolved once per dataset, before either
    per-type breakdown, so the utilization breakdown does not depend on a
    capacity column having been found first.

    Args:
        relevant_data: Names of datasets to look up in the registry.
        requirements: Parsed requirements; ``geographic_scope`` and
            ``regions`` are read (defaults ``"Unknown"`` and ``[]``).

    Returns:
        Dict that may contain ``total_capacity``, ``capacity_by_type``,
        ``average_utilization``, ``utilization_by_type``,
        ``capacity_trends``, ``capacity_growth_rate``, ``zone_summary``,
        ``max_absolute_decrease``, ``max_percentage_decrease`` and
        ``facilities_with_largest_declines``.
    """
    results = {}
    geographic_scope = requirements.get("geographic_scope", "Unknown")
    regions = requirements.get("regions", [])

    for data_name in relevant_data:
        df = self.data_registry.get(data_name)
        if df is None or df.empty:
            continue

        # Filter data based on geographic scope
        filtered_df = self._filter_by_geography(df, geographic_scope, regions)
        if filtered_df.empty:
            continue

        # Shared by the capacity and utilization per-type breakdowns.
        type_col = self._find_column(filtered_df, ['type', 'facility_type'])
        has_type = bool(type_col) and type_col in filtered_df.columns

        # Current capacity
        capacity_col = self._find_column(filtered_df, ['capacity', 'beds', 'current_capacity', 'beds_current'])
        if capacity_col:
            filtered_df[capacity_col] = pd.to_numeric(filtered_df[capacity_col], errors='coerce')
            results["total_capacity"] = filtered_df[capacity_col].sum()

            if has_type:
                results["capacity_by_type"] = filtered_df.groupby(type_col)[capacity_col].sum().to_dict()

        # Capacity utilization
        utilization_col = self._find_column(filtered_df, ['utilization', 'occupancy', 'occupancy_rate'])
        if utilization_col:
            filtered_df[utilization_col] = pd.to_numeric(filtered_df[utilization_col], errors='coerce')
            results["average_utilization"] = filtered_df[utilization_col].mean()

            if has_type:
                results["utilization_by_type"] = filtered_df.groupby(type_col)[utilization_col].mean().to_dict()

        # Capacity trends: columns whose label mentions one of the years.
        # str() keeps non-string labels (e.g. an integer 2022) from raising.
        time_cols = [
            col for col in filtered_df.columns
            if any(year in str(col).lower() for year in ['2020', '2021', '2022', '2023', '2024'])
        ]
        if len(time_cols) >= 2:
            trend_data = {}
            for col in time_cols:
                filtered_df[col] = pd.to_numeric(filtered_df[col], errors='coerce')
                trend_data[col] = filtered_df[col].sum()
            results["capacity_trends"] = trend_data

            # Growth between the first and last matching column, in column
            # order (not sorted by year).
            earliest = time_cols[0]
            latest = time_cols[-1]
            if trend_data[earliest] > 0:  # Avoid division by zero
                growth_rate = (trend_data[latest] - trend_data[earliest]) / trend_data[earliest] * 100
                results["capacity_growth_rate"] = growth_rate

        # Bed change analysis
        prev_col = self._find_column(filtered_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'])
        current_col = self._find_column(filtered_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'])

        if prev_col and current_col:
            filtered_df[prev_col] = pd.to_numeric(filtered_df[prev_col], errors='coerce')
            filtered_df[current_col] = pd.to_numeric(filtered_df[current_col], errors='coerce')

            filtered_df['bed_change'] = filtered_df[current_col] - filtered_df[prev_col]

            # A previous value of 0 yields 0 rather than a division error.
            filtered_df['percent_change'] = filtered_df.apply(
                lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
                axis=1
            )

            # Zone/Region-level analysis
            zone_col = self._find_column(filtered_df, ['zone', 'region', 'area', 'district'])
            if zone_col:
                filtered_df[zone_col] = filtered_df[zone_col].astype(str)

                zone_summary = filtered_df.groupby(zone_col).agg({
                    current_col: 'sum',
                    prev_col: 'sum',
                    'bed_change': 'sum'
                }).reset_index()

                # Recomputed from the zone totals, not averaged per facility.
                zone_summary['percent_change'] = zone_summary.apply(
                    lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
                    axis=1
                )

                results["zone_summary"] = zone_summary.to_dict('records')

                if not zone_summary.empty:
                    # Most negative bed_change = largest absolute decrease.
                    if zone_summary['bed_change'].notna().any():
                        max_abs_decrease_idx = zone_summary['bed_change'].idxmin()
                        results["max_absolute_decrease"] = zone_summary.loc[max_abs_decrease_idx].to_dict()

                    # Most negative percent_change = largest relative decrease.
                    if zone_summary['percent_change'].notna().any():
                        max_pct_decrease_idx = zone_summary['percent_change'].idxmin()
                        results["max_percentage_decrease"] = zone_summary.loc[max_pct_decrease_idx].to_dict()

            # Five rows with the smallest (most negative) bed_change.
            facilities_decline = filtered_df.sort_values('bed_change').head(5)
            if not facilities_decline.empty:
                results["facilities_with_largest_declines"] = facilities_decline.to_dict('records')

    return results
|
| 583 |
-
|
| 584 |
-
def _filter_by_geography(self, df: pd.DataFrame, geographic_scope: str, regions: List[str]) -> pd.DataFrame:
|
| 585 |
-
"""Filter dataframe based on geographic scope and regions"""
|
| 586 |
-
if geographic_scope == "Unknown" and not regions:
|
| 587 |
-
return df.copy()
|
| 588 |
-
|
| 589 |
-
# Try to find a geographic column
|
| 590 |
-
geo_col = self._find_column(df, ['province', 'state', 'region', 'zone', 'area', 'district'])
|
| 591 |
-
|
| 592 |
-
if geo_col is None:
|
| 593 |
-
return df.copy()
|
| 594 |
-
|
| 595 |
-
# Ensure we're working with string data
|
| 596 |
-
try:
|
| 597 |
-
df[geo_col] = df[geo_col].astype(str)
|
| 598 |
-
except Exception as e:
|
| 599 |
-
logger.warning(f"Error converting column {geo_col} to string: {str(e)}")
|
| 600 |
-
return df.copy()
|
| 601 |
-
|
| 602 |
-
# Create filters
|
| 603 |
-
filters = []
|
| 604 |
-
|
| 605 |
-
# Add geographic scope filter
|
| 606 |
-
if geographic_scope != "Unknown":
|
| 607 |
-
# Create a list of possible values for the geographic scope
|
| 608 |
-
scope_values = [geographic_scope.lower()]
|
| 609 |
-
|
| 610 |
-
# Add common abbreviations
|
| 611 |
-
abbreviations = {
|
| 612 |
-
# Canadian provinces
|
| 613 |
-
"alberta": "ab", "british columbia": "bc", "ontario": "on", "quebec": "qc",
|
| 614 |
-
"manitoba": "mb", "saskatchewan": "sk", "nova scotia": "ns", "new brunswick": "nb",
|
| 615 |
-
"prince edward island": "pe", "newfoundland": "nl", "yukon": "yt",
|
| 616 |
-
"northwest territories": "nt", "nunavut": "nu",
|
| 617 |
-
# US states
|
| 618 |
-
"alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
|
| 619 |
-
"california": "ca", "colorado": "co", "connecticut": "ct", "delaware": "de",
|
| 620 |
-
"florida": "fl", "georgia": "ga", "hawaii": "hi", "idaho": "id",
|
| 621 |
-
"illinois": "il", "indiana": "in", "iowa": "ia", "kansas": "ks",
|
| 622 |
-
"kentucky": "ky", "louisiana": "la", "maine": "me", "maryland": "md",
|
| 623 |
-
"massachusetts": "ma", "michigan": "mi", "minnesota": "mn", "mississippi": "ms",
|
| 624 |
-
"missouri": "mo", "montana": "mt", "nebraska": "ne", "nevada": "nv",
|
| 625 |
-
"new hampshire": "nh", "new jersey": "nj", "new mexico": "nm", "new york": "ny",
|
| 626 |
-
"north carolina": "nc", "north dakota": "nd", "ohio": "oh", "oklahoma": "ok",
|
| 627 |
-
"oregon": "or", "pennsylvania": "pa", "rhode island": "ri", "south carolina": "sc",
|
| 628 |
-
"south dakota": "sd", "tennessee": "tn", "texas": "tx", "utah": "ut",
|
| 629 |
-
"vermont": "vt", "virginia": "va", "washington": "wa", "west virginia": "wv",
|
| 630 |
-
"wisconsin": "wi", "wyoming": "wy"
|
| 631 |
-
}
|
| 632 |
-
|
| 633 |
-
if geographic_scope.lower() in abbreviations:
|
| 634 |
-
scope_values.append(abbreviations[geographic_scope.lower()])
|
| 635 |
-
|
| 636 |
-
try:
|
| 637 |
-
scope_filter = df[geo_col].str.lower().isin(scope_values)
|
| 638 |
-
filters.append(scope_filter)
|
| 639 |
-
except Exception as e:
|
| 640 |
-
logger.warning(f"Error creating scope filter: {str(e)}")
|
| 641 |
-
|
| 642 |
-
# Add region filters
|
| 643 |
-
if regions:
|
| 644 |
-
try:
|
| 645 |
-
region_filter = df[geo_col].str.lower().isin([r.lower() for r in regions])
|
| 646 |
-
filters.append(region_filter)
|
| 647 |
-
except Exception as e:
|
| 648 |
-
logger.warning(f"Error creating region filter: {str(e)}")
|
| 649 |
-
|
| 650 |
-
# Apply filters
|
| 651 |
-
if filters:
|
| 652 |
-
try:
|
| 653 |
-
combined_filter = filters[0]
|
| 654 |
-
for f in filters[1:]:
|
| 655 |
-
combined_filter = combined_filter | f
|
| 656 |
-
|
| 657 |
-
return df[combined_filter].copy()
|
| 658 |
-
except Exception as e:
|
| 659 |
-
logger.warning(f"Error applying filters: {str(e)}")
|
| 660 |
-
|
| 661 |
-
return df.copy()
|
| 662 |
-
|
| 663 |
-
def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
    """Summarise staffing and equipment totals for the given datasets.

    Each dataset is fetched from ``self.data_registry`` and inspected for:

    * a staff column (first column matched by ``self._find_column`` for
      ``staff`` / ``employees`` / ``fte``), reported as ``"total_staff"``;
    * a capacity column (``capacity`` / ``beds``), combined with the staff
      column into ``"staff_per_bed_ratio"``: the mean of the per-row
      staff/bed ratio, where rows with zero or missing beds are ignored;
    * every column whose name contains ``equipment``, reported as
      ``"equipment_summary"`` (column name -> column total).

    Cells are coerced with ``pd.to_numeric(errors='coerce')``, so values
    that are not numeric are treated as missing.  The coercion is done on
    local Series only; the DataFrames stored in the registry are left
    untouched (no columns are rewritten or added).

    Args:
        relevant_data: Names of the datasets to look up in the registry.

    Returns:
        A dict holding whichever of the keys above could be computed.  If
        several datasets provide the same metric, the one processed last
        overwrites the earlier value.
    """
    results: Dict[str, Any] = {}

    for data_name in relevant_data:
        df = self.data_registry.get(data_name)
        if df is None or df.empty:
            continue

        # Staff analysis
        staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
        if staff_col is not None:
            # Coerce into a local Series so the shared frame is not mutated.
            staff = pd.to_numeric(df[staff_col], errors='coerce')
            results["total_staff"] = staff.sum()

            # Staff per bed ratio
            capacity_col = self._find_column(df, ['capacity', 'beds'])
            if capacity_col is not None and capacity_col in df.columns:
                beds = pd.to_numeric(df[capacity_col], errors='coerce')
                # Zero beds become NaN so the division cannot produce inf.
                staff_per_bed = staff / beds.replace(0, np.nan)
                results["staff_per_bed_ratio"] = staff_per_bed.mean()

        # Equipment analysis
        # str() keeps this working when a column label is not a string.
        equipment_cols = [col for col in df.columns if 'equipment' in str(col).lower()]
        if equipment_cols:
            results["equipment_summary"] = {
                col: pd.to_numeric(df[col], errors='coerce').sum()
                for col in equipment_cols
            }

    return results
|
| 700 |
-
|
| 701 |
-
def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
    """Compute changes between consecutive year-labelled columns.

    A column counts as time-based when its name contains one of the year
    strings ``2020`` .. ``2024``.  For a dataset with at least two such
    columns, the column totals are compared pairwise in the order the
    columns appear in the frame, and each pair is reported under the key
    ``"<prev column>_to_<curr column>"`` with its ``absolute_change`` and
    ``percentage_change``.  A pair is skipped when the earlier total is
    not positive, because the percentage would be undefined or misleading.

    Cells are coerced with ``pd.to_numeric(errors='coerce')`` on local
    Series; the DataFrames stored in the registry are not modified.

    Args:
        relevant_data: Names of the datasets to look up in
            ``self.data_registry``.

    Returns:
        ``{"year_over_year_trends": {...}}`` for the last dataset that had
        at least two year columns, or an empty dict if none did.
    """
    results: Dict[str, Any] = {}
    year_tokens = ('2020', '2021', '2022', '2023', '2024')

    for data_name in relevant_data:
        df = self.data_registry.get(data_name)
        if df is None or df.empty:
            continue

        # Find time-based columns (str() tolerates non-string labels).
        time_cols = [
            col for col in df.columns
            if any(year in str(col).lower() for year in year_tokens)
        ]
        if len(time_cols) < 2:
            continue

        # Convert each column once, without writing back to the shared frame.
        totals = [
            (col, pd.to_numeric(df[col], errors='coerce').sum())
            for col in time_cols
        ]

        # Calculate year-over-year changes
        trends = {}
        for (prev_year, prev_total), (curr_year, curr_total) in zip(totals, totals[1:]):
            if prev_total > 0:  # Avoid division by zero
                change = curr_total - prev_total
                trends[f"{prev_year}_to_{curr_year}"] = {
                    "absolute_change": change,
                    "percentage_change": change / prev_total * 100
                }

        results["year_over_year_trends"] = trends

    return results
|
| 738 |
-
|
| 739 |
-
def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
    """Return a fixed catalogue of suggested data/AI enhancements.

    The catalogue is static: ``analysis_results`` is accepted but not
    consulted.  A fresh dict (with fresh lists) is built on every call.

    Returns:
        A dict with three lists, ``"data_integration"``,
        ``"ai_applications"`` and ``"enhanced_metrics"``; every entry is a
        dict carrying a title (``"opportunity"`` or ``"metric"``), a
        ``"description"`` and a ``"benefit"``.
    """
    # Data integration opportunities
    data_integration = [
        {
            "opportunity": "Integrate real-time occupancy data",
            "description": "Combine current facility data with real-time occupancy monitoring systems",
            "benefit": "Enable dynamic resource allocation and surge planning",
        },
        {
            "opportunity": "Incorporate demographic data",
            "description": "Add population demographics and health needs data",
            "benefit": "Improve demand forecasting and service planning",
        },
    ]

    # AI application opportunities
    ai_applications = [
        {
            "opportunity": "Predictive capacity modeling",
            "description": "Use ML to forecast capacity needs based on trends and external factors",
            "benefit": "Proactive resource planning and reduced wait times",
        },
        {
            "opportunity": "Optimization algorithms",
            "description": "Implement AI for staff scheduling and resource allocation",
            "benefit": "Improved efficiency and reduced operational costs",
        },
    ]

    # Enhanced metrics
    enhanced_metrics = [
        {
            "metric": "Patient flow efficiency",
            "description": "Measure time from admission to discharge across facilities",
            "benefit": "Identify bottlenecks and improve patient experience",
        },
        {
            "metric": "Resource utilization index",
            "description": "Composite metric combining staff, equipment, and space utilization",
            "benefit": "Holistic view of operational efficiency",
        },
    ]

    return {
        "data_integration": data_integration,
        "ai_applications": ai_applications,
        "enhanced_metrics": enhanced_metrics,
    }
|
| 787 |
-
|
| 788 |
-
# Helper methods
|
| 789 |
-
def _find_column(self, df, patterns):
|
| 790 |
-
"""Find the first column matching any pattern"""
|
| 791 |
-
if df is None or df.empty:
|
| 792 |
-
return None
|
| 793 |
-
for col in df.columns:
|
| 794 |
-
if any(pattern.lower() in col.lower() for pattern in patterns):
|
| 795 |
-
return col
|
| 796 |
-
return None
|
| 797 |
-
|
| 798 |
-
def _calculate_gini(self, values):
|
| 799 |
-
"""Calculate Gini coefficient for inequality measurement"""
|
| 800 |
-
if not values or len(values) < 2:
|
| 801 |
-
return 0
|
| 802 |
-
|
| 803 |
-
values = sorted(values)
|
| 804 |
-
n = len(values)
|
| 805 |
-
index = np.arange(1, n + 1)
|
| 806 |
-
total = np.sum(values)
|
| 807 |
-
|
| 808 |
-
if total == 0:
|
| 809 |
-
return 0
|
| 810 |
-
|
| 811 |
-
gini = (np.sum((2 * index - n - 1) * values)) / (n * total)
|
| 812 |
-
return gini
|
| 813 |
-
|
| 814 |
-
def _calculate_diversity_index(self, distribution):
|
| 815 |
-
"""Calculate Shannon diversity index"""
|
| 816 |
-
if not distribution:
|
| 817 |
-
return 0
|
| 818 |
-
|
| 819 |
-
total = sum(distribution.values())
|
| 820 |
-
if total == 0:
|
| 821 |
-
return 0
|
| 822 |
-
|
| 823 |
-
proportions = [count/total for count in distribution.values() if count > 0]
|
| 824 |
-
if not proportions:
|
| 825 |
-
return 0
|
| 826 |
-
|
| 827 |
-
return -sum(p * np.log(p) for p in proportions)
|
| 828 |
-
|
| 829 |
-
def _extract_geographic_scope(self, text):
|
| 830 |
-
"""Extract geographic scope from text"""
|
| 831 |
-
# Look for province/state names
|
| 832 |
-
provinces = [
|
| 833 |
-
"alberta", "british columbia", "ontario", "quebec", "manitoba",
|
| 834 |
-
"saskatchewan", "nova scotia", "new brunswick", "prince edward island",
|
| 835 |
-
"newfoundland", "yukon", "northwest territories", "nunavut"
|
| 836 |
-
]
|
| 837 |
-
|
| 838 |
-
states = [
|
| 839 |
-
"alabama", "alaska", "arizona", "arkansas", "california", "colorado",
|
| 840 |
-
"connecticut", "delaware", "florida", "georgia", "hawaii", "idaho",
|
| 841 |
-
"illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
|
| 842 |
-
"maine", "maryland", "massachusetts", "michigan", "minnesota",
|
| 843 |
-
"mississippi", "missouri", "montana", "nebraska", "nevada",
|
| 844 |
-
"new hampshire", "new jersey", "new mexico", "new york",
|
| 845 |
-
"north carolina", "north dakota", "ohio", "oklahoma", "oregon",
|
| 846 |
-
"pennsylvania", "rhode island", "south carolina", "south dakota",
|
| 847 |
-
"tennessee", "texas", "utah", "vermont", "virginia", "washington",
|
| 848 |
-
"west virginia", "wisconsin", "wyoming"
|
| 849 |
-
]
|
| 850 |
-
|
| 851 |
-
text_lower = text.lower()
|
| 852 |
-
|
| 853 |
-
# Check for provinces
|
| 854 |
-
for province in provinces:
|
| 855 |
-
if province in text_lower:
|
| 856 |
-
return province.title()
|
| 857 |
-
|
| 858 |
-
# Check for states
|
| 859 |
-
for state in states:
|
| 860 |
-
if state in text_lower:
|
| 861 |
-
return state.title()
|
| 862 |
-
|
| 863 |
-
# Check for countries
|
| 864 |
-
if "canada" in text_lower:
|
| 865 |
-
return "Canada"
|
| 866 |
-
if "usa" in text_lower or "united states" in text_lower:
|
| 867 |
-
return "United States"
|
| 868 |
-
|
| 869 |
-
return "Unknown"
|
| 870 |
-
|
| 871 |
-
def _extract_time_period(self, text):
|
| 872 |
-
"""Extract time period from text"""
|
| 873 |
-
# Look for year patterns
|
| 874 |
-
years = re.findall(r'\b(20\d{2})\b', text)
|
| 875 |
-
if len(years) >= 2:
|
| 876 |
-
return f"{min(years)}-{max(years)}"
|
| 877 |
-
return "Unknown"
|
| 878 |
-
|
| 879 |
-
def _extract_facility_types(self, text):
|
| 880 |
-
"""Extract facility types from text"""
|
| 881 |
-
types = []
|
| 882 |
-
if "hospital" in text.lower():
|
| 883 |
-
types.append("Hospitals")
|
| 884 |
-
if "nursing" in text.lower() or "long-term" in text.lower():
|
| 885 |
-
types.append("Nursing homes")
|
| 886 |
-
if "clinic" in text.lower():
|
| 887 |
-
types.append("Clinics")
|
| 888 |
-
return types
|
| 889 |
-
|
| 890 |
-
def _extract_metrics(self, text):
|
| 891 |
-
"""Extract required metrics from text"""
|
| 892 |
-
metrics = []
|
| 893 |
-
if "bed" in text.lower():
|
| 894 |
-
metrics.append("Bed capacity")
|
| 895 |
-
if "occupancy" in text.lower():
|
| 896 |
-
metrics.append("Occupancy rates")
|
| 897 |
-
if "staff" in text.lower():
|
| 898 |
-
metrics.append("Staffing levels")
|
| 899 |
-
return metrics
|
| 900 |
-
|
| 901 |
-
def _extract_regions(self, text):
|
| 902 |
-
"""Extract specific regions mentioned in the scenario"""
|
| 903 |
-
# Look for region names in the scenario
|
| 904 |
-
regions = []
|
| 905 |
-
|
| 906 |
-
# Common region patterns - this could be expanded
|
| 907 |
-
region_patterns = [
|
| 908 |
-
r'([A-Z][a-z]+ (Zone|Region|Area|District))',
|
| 909 |
-
r'(North|South|East|West|Central)',
|
| 910 |
-
r'([A-Z][a-z]+ (City|County|State|Province))',
|
| 911 |
-
r'([A-Z][a-z]+)'
|
| 912 |
-
]
|
| 913 |
-
|
| 914 |
-
for pattern in region_patterns:
|
| 915 |
-
matches = re.findall(pattern, text)
|
| 916 |
-
for match in matches:
|
| 917 |
-
if isinstance(match, tuple):
|
| 918 |
-
regions.append(match[0])
|
| 919 |
-
else:
|
| 920 |
-
regions.append(match)
|
| 921 |
-
|
| 922 |
-
# Remove duplicates while preserving order
|
| 923 |
-
seen = set()
|
| 924 |
-
unique_regions = [r for r in regions if not (r in seen or seen.add(r))]
|
| 925 |
-
|
| 926 |
-
return unique_regions
|
| 927 |
-
|
| 928 |
-
def _identify_relevant_data(self, text):
|
| 929 |
-
"""Identify relevant datasets for the scenario"""
|
| 930 |
-
# Use data registry's find_related_datasets method
|
| 931 |
-
keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
|
| 932 |
-
return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]
|
|
|
|
| 1 |
# healthcare_analysis.py
|
| 2 |
import pandas as pd
|
| 3 |
+
from data_registry import DataRegistry
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
class HealthcareAnalyzer:
    """Thin wrapper that exposes every dataset held by a ``DataRegistry``."""

    def __init__(self, registry: DataRegistry):
        """Keep a reference to the registry that will be queried."""
        self.registry = registry

    def comprehensive_analysis(self, scenario: str) -> dict:
        """Return ``{name: registry.get(name)}`` for every registry name.

        ``scenario`` is accepted but not used by this implementation.
        """
        return {name: self.registry.get(name) for name in self.registry.names()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|