# Medica_DecisionSupportAI / healthcare_analysis.py
# Rajan Sharma - "Create healthcare_analysis.py" (commit 379e195, verified)
# File size: 17.6 kB
# healthcare_analysis.py
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import logging
# Configure the root logger at INFO level; this runs as a side effect of importing the module.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by HealthcareAnalyzer.
logger = logging.getLogger(__name__)
class HealthcareAnalyzer:
    """Keyword-driven analysis of healthcare facility datasets.

    The analyzer reads a free-text scenario, decides which analyses to run by
    substring matching, pulls DataFrames out of ``data_registry`` and returns
    plain dictionaries of summary statistics plus rule-based recommendations.

    ``data_registry`` must provide:

    * ``get(name)`` -> a pandas DataFrame, or ``None`` when the dataset is
      unavailable;
    * ``find_related_datasets(keywords)`` -> an iterable of mappings that each
      have a ``"name"`` key.

    NOTE: when several datasets are relevant, every ``analyze_*`` method writes
    into the same result keys, so the last dataset that has a matching column
    wins. Columns are located by case-insensitive substring match.
    """

    # Substrings that mark a column as belonging to a yearly time series.
    _YEAR_TOKENS = ('2020', '2021', '2022', '2023', '2024')

    def __init__(self, data_registry):
        """Store the registry and start with an empty result cache."""
        self.data_registry = data_registry
        # Result of the most recent comprehensive_analysis() call.
        self.analysis_results = {}

    def comprehensive_analysis(self, scenario_text: str) -> Dict[str, Any]:
        """Perform comprehensive healthcare scenario analysis.

        Args:
            scenario_text: Free-text description of the scenario.

        Returns:
            A dict with one entry per detected task (``facility_distribution``,
            ``capacity_analysis``, ``resource_allocation``, ``trends``) plus
            ``recommendations`` and ``future_integration``. The same dict is
            also stored on ``self.analysis_results``.
        """
        logger.info("Starting comprehensive healthcare analysis")

        # Extract tasks and requirements
        tasks = self._extract_tasks(scenario_text)
        requirements = self._extract_requirements(scenario_text)

        # Identify relevant datasets
        relevant_data = self._identify_relevant_data(scenario_text)

        # Perform analyses based on tasks (order fixes the result key order)
        analyzers = (
            ("facility_distribution", self.analyze_facility_distribution),
            ("capacity_analysis", self.analyze_capacity),
            ("resource_allocation", self.analyze_resource_allocation),
            ("trends", self.analyze_trends),
        )
        results: Dict[str, Any] = {}
        for task_name, analyze in analyzers:
            if task_name in tasks:
                results[task_name] = analyze(relevant_data)

        # Generate recommendations
        results["recommendations"] = self.generate_recommendations(results, requirements)

        # Future integration opportunities
        results["future_integration"] = self.identify_integration_opportunities(results)

        # Keep the latest run available on the instance.
        self.analysis_results = results
        logger.info("Comprehensive analysis completed")
        return results

    def _extract_tasks(self, scenario_text: str) -> List[str]:
        """Return the task names whose keywords occur in ``scenario_text``.

        Matching is a case-insensitive substring test, so e.g. "facilities"
        matches the keyword "facilit..." only if the keyword itself is a
        substring of the text.
        """
        task_keywords = {
            "facility_distribution": ["facility", "distribution", "location", "sites"],
            "capacity_analysis": ["capacity", "beds", "occupancy", "utilization"],
            "resource_allocation": ["resource", "allocation", "staffing", "equipment"],
            "trends": ["trend", "change", "growth", "decline", "pattern"],
        }
        lowered = scenario_text.lower()
        return [
            task_type
            for task_type, keywords in task_keywords.items()
            if any(kw in lowered for kw in keywords)
        ]

    def _extract_requirements(self, scenario_text: str) -> Dict[str, Any]:
        """Collect scope, period, facility types and metrics named in the text."""
        return {
            "geographic_scope": self._extract_geographic_scope(scenario_text),
            "time_period": self._extract_time_period(scenario_text),
            "facility_types": self._extract_facility_types(scenario_text),
            "metrics_needed": self._extract_metrics(scenario_text),
        }

    def analyze_facility_distribution(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise how facilities are spread by region, type and urban/rural.

        Args:
            relevant_data: Dataset names to look up in the registry; names the
                registry returns ``None`` for are skipped.

        Returns:
            A dict that may contain ``geographic_distribution`` (row counts per
            region value), ``geographic_inequality`` (Gini coefficient of those
            counts), ``facility_type_distribution``, ``facility_diversity``
            (Shannon index) and ``urban_rural_distribution``.
        """
        results: Dict[str, Any] = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Geographic distribution
            geo_col = self._find_column(df, ['province', 'state', 'region', 'zone'])
            if geo_col is not None:
                geo_dist = df[geo_col].value_counts().to_dict()
                results["geographic_distribution"] = geo_dist
                # Gini coefficient of the per-region row counts
                results["geographic_inequality"] = self._calculate_gini(list(geo_dist.values()))

            # Facility type distribution
            type_col = self._find_column(df, ['type', 'category', 'facility_type'])
            if type_col is not None:
                type_dist = df[type_col].value_counts().to_dict()
                results["facility_type_distribution"] = type_dist
                results["facility_diversity"] = self._calculate_diversity_index(type_dist)

            # Urban vs rural distribution
            urban_col = self._find_column(df, ['urban', 'rural', 'location_type'])
            if urban_col is not None:
                results["urban_rural_distribution"] = df[urban_col].value_counts().to_dict()
        return results

    def analyze_capacity(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Summarise capacity, utilization and yearly capacity totals.

        Args:
            relevant_data: Dataset names to look up in the registry.

        Returns:
            A dict that may contain ``total_capacity``, ``capacity_by_type``,
            ``average_utilization``, ``utilization_by_type``,
            ``capacity_trends`` (column -> column sum) and
            ``capacity_growth_rate`` (percent change from the first to the last
            year column, in DataFrame column order). The growth rate is omitted
            when the first year's total is zero.
        """
        results: Dict[str, Any] = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Resolved once per dataset: both the capacity and the utilization
            # breakdowns need it, independently of each other.
            type_col = self._find_column(df, ['type', 'facility_type'])

            # Current capacity
            capacity_col = self._find_column(df, ['capacity', 'beds', 'current_capacity'])
            if capacity_col is not None:
                results["total_capacity"] = df[capacity_col].sum()
                if type_col is not None:
                    results["capacity_by_type"] = (
                        df.groupby(type_col)[capacity_col].sum().to_dict()
                    )

            # Capacity utilization
            utilization_col = self._find_column(df, ['utilization', 'occupancy', 'occupancy_rate'])
            if utilization_col is not None:
                results["average_utilization"] = df[utilization_col].mean()
                if type_col is not None:
                    results["utilization_by_type"] = (
                        df.groupby(type_col)[utilization_col].mean().to_dict()
                    )

            # Capacity trends
            time_cols = self._find_year_columns(df)
            if len(time_cols) >= 2:
                trend_data = {col: df[col].sum() for col in time_cols}
                results["capacity_trends"] = trend_data

                # Growth rate is undefined for a zero baseline; skip it rather
                # than reporting inf/NaN.
                baseline = trend_data[time_cols[0]]
                if baseline != 0:
                    latest_total = trend_data[time_cols[-1]]
                    results["capacity_growth_rate"] = (latest_total - baseline) / baseline * 100
        return results

    def analyze_resource_allocation(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Analyze staffing and equipment totals.

        The registry's DataFrames are not modified.

        Args:
            relevant_data: Dataset names to look up in the registry.

        Returns:
            A dict that may contain ``total_staff``, ``staff_per_bed_ratio``
            (mean of the per-row staff/capacity ratio, ignoring rows whose
            capacity is zero or missing; omitted when no row has a usable
            ratio) and ``equipment_summary`` (column -> column sum).
        """
        results: Dict[str, Any] = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            # Staff analysis
            staff_col = self._find_column(df, ['staff', 'employees', 'fte'])
            if staff_col is not None:
                results["total_staff"] = df[staff_col].sum()

                # Staff per bed ratio
                capacity_col = self._find_column(df, ['capacity', 'beds'])
                if capacity_col is not None:
                    # Zero capacity becomes NaN so it drops out of the mean
                    # instead of producing an infinite ratio.
                    usable_capacity = df[capacity_col].replace(0, np.nan)
                    avg_staff_per_bed = (df[staff_col] / usable_capacity).mean()
                    if pd.notna(avg_staff_per_bed):
                        results["staff_per_bed_ratio"] = avg_staff_per_bed

            # Equipment analysis
            equipment_cols = [col for col in df.columns if 'equipment' in str(col).lower()]
            if equipment_cols:
                results["equipment_summary"] = {col: df[col].sum() for col in equipment_cols}
        return results

    def analyze_trends(self, relevant_data: List[str]) -> Dict[str, Any]:
        """Compute changes between consecutive year columns.

        Args:
            relevant_data: Dataset names to look up in the registry.

        Returns:
            A dict that may contain ``year_over_year_trends``: a mapping from
            ``"<prev>_to_<curr>"`` to ``absolute_change`` and
            ``percentage_change`` of the column sums. Pairs whose earlier total
            is not positive are left out.
        """
        results: Dict[str, Any] = {}
        for data_name in relevant_data:
            df = self.data_registry.get(data_name)
            if df is None:
                continue

            time_cols = self._find_year_columns(df)
            if len(time_cols) < 2:
                continue

            trends = {}
            # Adjacent columns, in DataFrame column order
            for prev_year, curr_year in zip(time_cols, time_cols[1:]):
                prev_total = df[prev_year].sum()
                curr_total = df[curr_year].sum()
                if prev_total > 0:
                    trends[f"{prev_year}_to_{curr_year}"] = {
                        "absolute_change": curr_total - prev_total,
                        "percentage_change": (curr_total - prev_total) / prev_total * 100,
                    }
            results["year_over_year_trends"] = trends
        return results

    def generate_recommendations(self, analysis_results: Dict[str, Any], requirements: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate data-driven operational recommendations.

        Args:
            analysis_results: Output of the ``analyze_*`` methods keyed by task
                name.
            requirements: Extracted requirements; currently not consulted.

        Returns:
            Recommendation dicts (``title``, ``description``, ``priority``,
            ``data_source``) sorted High -> Medium -> Low.

        NOTE(review): the 0.7 utilization threshold and the ``:.1%`` formatting
        assume utilization is stored as a fraction (0-1), not a percentage -
        confirm against the datasets.
        """
        recommendations = []

        # Capacity-related recommendations
        if "capacity_analysis" in analysis_results:
            capacity = analysis_results["capacity_analysis"]

            # Low utilization
            if "average_utilization" in capacity and capacity["average_utilization"] < 0.7:
                recommendations.append({
                    "title": "Optimize Underutilized Capacity",
                    "description": f"Average utilization is {capacity['average_utilization']:.1%}. Consider repurposing underutilized facilities or consolidating services.",
                    "priority": "Medium",
                    "data_source": "Capacity utilization analysis",
                })

            # Slow capacity growth (below 2 percent)
            if "capacity_growth_rate" in capacity and capacity["capacity_growth_rate"] < 2:
                recommendations.append({
                    "title": "Expand Capacity Strategically",
                    "description": f"Capacity growth rate is only {capacity['capacity_growth_rate']:.1f}%. Invest in new facilities or expand existing ones to meet demand.",
                    "priority": "High",
                    "data_source": "Capacity trend analysis",
                })

        # Geographic distribution recommendations
        if "facility_distribution" in analysis_results:
            dist = analysis_results["facility_distribution"]
            if "geographic_inequality" in dist and dist["geographic_inequality"] > 0.4:
                recommendations.append({
                    "title": "Address Geographic Inequity",
                    "description": f"High geographic inequality (Gini: {dist['geographic_inequality']:.2f}). Consider targeted investments in underserved areas.",
                    "priority": "High",
                    "data_source": "Geographic distribution analysis",
                })

        # Resource allocation recommendations
        if "resource_allocation" in analysis_results:
            resources = analysis_results["resource_allocation"]
            if "staff_per_bed_ratio" in resources and resources["staff_per_bed_ratio"] < 1.5:
                recommendations.append({
                    "title": "Increase Staffing Levels",
                    "description": f"Staff per bed ratio is {resources['staff_per_bed_ratio']:.2f}, which may be insufficient. Consider hiring additional staff.",
                    "priority": "High",
                    "data_source": "Resource allocation analysis",
                })

        # Sort by priority (stable, so insertion order is kept within a level)
        priority_order = {"High": 0, "Medium": 1, "Low": 2}
        recommendations.sort(key=lambda rec: priority_order.get(rec["priority"], 3))
        return recommendations

    def identify_integration_opportunities(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
        """Identify opportunities for AI integration and data enhancement.

        The returned catalogue is fixed; ``analysis_results`` is accepted for
        interface compatibility but not consulted. A fresh dict is built on
        every call, so callers may mutate it.
        """
        return {
            "data_integration": [
                {
                    "opportunity": "Integrate real-time occupancy data",
                    "description": "Combine current facility data with real-time occupancy monitoring systems",
                    "benefit": "Enable dynamic resource allocation and surge planning",
                },
                {
                    "opportunity": "Incorporate demographic data",
                    "description": "Add population demographics and health needs data",
                    "benefit": "Improve demand forecasting and service planning",
                },
            ],
            "ai_applications": [
                {
                    "opportunity": "Predictive capacity modeling",
                    "description": "Use ML to forecast capacity needs based on trends and external factors",
                    "benefit": "Proactive resource planning and reduced wait times",
                },
                {
                    "opportunity": "Optimization algorithms",
                    "description": "Implement AI for staff scheduling and resource allocation",
                    "benefit": "Improved efficiency and reduced operational costs",
                },
            ],
            "enhanced_metrics": [
                {
                    "metric": "Patient flow efficiency",
                    "description": "Measure time from admission to discharge across facilities",
                    "benefit": "Identify bottlenecks and improve patient experience",
                },
                {
                    "metric": "Resource utilization index",
                    "description": "Composite metric combining staff, equipment, and space utilization",
                    "benefit": "Holistic view of operational efficiency",
                },
            ],
        }

    # Helper methods
    def _find_column(self, df, patterns):
        """Return the first column whose name contains any pattern, else None.

        Matching is a case-insensitive substring test; column labels are
        converted with ``str()`` first so non-string labels do not raise.
        """
        lowered_patterns = [pattern.lower() for pattern in patterns]
        for col in df.columns:
            col_name = str(col).lower()
            if any(pattern in col_name for pattern in lowered_patterns):
                return col
        return None

    def _find_year_columns(self, df):
        """Return numeric columns whose name contains a known year token.

        Columns keep their DataFrame order; non-numeric columns are skipped
        because their sums cannot be compared arithmetically.
        """
        return [
            col
            for col in df.columns
            if any(year in str(col) for year in self._YEAR_TOKENS)
            and pd.api.types.is_numeric_dtype(df[col])
        ]

    def _calculate_gini(self, values):
        """Calculate the Gini coefficient of non-negative ``values``.

        Returns 0.0 for an empty input or when all values sum to zero, where
        the coefficient is otherwise undefined.
        """
        ordered = np.sort(np.asarray(list(values), dtype=float))
        n = ordered.size
        total = ordered.sum()
        if n == 0 or total == 0:
            return 0.0
        ranks = np.arange(1, n + 1)
        return float(np.sum((2 * ranks - n - 1) * ordered) / (n * total))

    def _calculate_diversity_index(self, distribution):
        """Calculate the Shannon diversity index (natural log) of the counts.

        Returns 0 when the counts sum to zero.
        """
        total = sum(distribution.values())
        if total == 0:
            return 0
        proportions = [count / total for count in distribution.values()]
        return -sum(p * np.log(p) for p in proportions if p > 0)

    def _extract_geographic_scope(self, text):
        """Return "Alberta", "Canada" or "Unknown" by keyword, in that priority."""
        lowered = text.lower()
        if "alberta" in lowered:
            return "Alberta"
        if "canada" in lowered:
            return "Canada"
        return "Unknown"

    def _extract_time_period(self, text):
        """Return "<min>-<max>" over the 20xx years in ``text``.

        Returns "Unknown" when fewer than two year mentions are found.
        """
        import re
        years = re.findall(r'\b(20\d{2})\b', text)
        if len(years) >= 2:
            # Four-digit strings compare in numeric order.
            return f"{min(years)}-{max(years)}"
        return "Unknown"

    def _extract_facility_types(self, text):
        """Return the facility type labels whose keywords occur in ``text``."""
        lowered = text.lower()
        types = []
        if "hospital" in lowered:
            types.append("Hospitals")
        if "nursing" in lowered or "long-term" in lowered:
            types.append("Nursing homes")
        if "clinic" in lowered:
            types.append("Clinics")
        return types

    def _extract_metrics(self, text):
        """Return the metric labels whose keywords occur in ``text``."""
        lowered = text.lower()
        metrics = []
        if "bed" in lowered:
            metrics.append("Bed capacity")
        if "occupancy" in lowered:
            metrics.append("Occupancy rates")
        if "staff" in lowered:
            metrics.append("Staffing levels")
        return metrics

    def _identify_relevant_data(self, text):
        """Return dataset names the registry reports for a fixed keyword list.

        ``text`` is currently not used; the keyword list is hard-coded.
        """
        keywords = ["facility", "bed", "capacity", "healthcare", "hospital"]
        return [item["name"] for item in self.data_registry.find_related_datasets(keywords)]