# schema_mapper.py
"""Map healthcare concepts mentioned in scenario text onto data-registry entries.

The module scans free text for known healthcare terms, tries to associate each
detected concept with a dataset name from a registry object, and can turn the
outcome into a short list of clarifying questions.
"""
from typing import Any, Dict, List, Optional, Set
import re

# Healthcare terminology mappings
HEALTHCARE_CONCEPTS: Dict[str, List[str]] = {
    # Facility types
    "hospital": ["hospital", "medical center", "health centre", "clinic"],
    "nursing_facility": ["nursing home", "long-term care", "residential care", "care facility"],
    "ambulatory_care": ["ambulatory", "outpatient", "clinic", "surgery center"],
    # Capacity metrics
    "bed_capacity": ["beds", "capacity", "bed count", "staffed beds"],
    "occupancy_rate": ["occupancy", "utilization", "bed occupancy"],
    # Geographic terms
    "zone": ["zone", "region", "area", "district"],
    "province": ["province", "state", "territory"],
    # Time periods
    "fiscal_year": ["fiscal year", "fy", "financial year"],
    "current_period": ["current", "2023-24", "present", "latest"],
    "previous_period": ["previous", "2022-23", "past", "last"],
    # Healthcare operations
    "patient_flow": ["patient flow", "throughput", "patient movement"],
    "resource_allocation": ["resource allocation", "staffing", "resource distribution"],
    "surge_capacity": ["surge", "overflow", "emergency capacity"],
}

# Returned by build_phase1_questions when no clarifying questions are needed.
_READY_MESSAGE = (
    "**Data Analysis Ready**: Your data appears well-structured. "
    "Please provide any additional context about your analysis goals."
)


class MappingResult:
    """Container for the outcome of a concept-mapping pass."""

    def __init__(self):
        # Successfully mapped concepts (concept name -> mapped target)
        self.resolved: Dict[str, Any] = {}
        # Concepts with multiple possible mappings (concept name -> options)
        self.ambiguous: Dict[str, Any] = {}
        # Concepts that couldn't be mapped
        self.missing: Set[str] = set()

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(resolved={self.resolved!r}, "
            f"ambiguous={self.ambiguous!r}, missing={self.missing!r})"
        )


def _mentions(text: str, terms) -> bool:
    """Return True if any of *terms* occurs in *text* starting at a word boundary.

    A plain substring test reports "fy" inside "identify" or "current" inside
    "concurrent". Requiring a non-word character (or the start of the text)
    before the term removes those hits, while trailing characters are still
    allowed so inflected forms such as "hospitals" keep matching.
    """
    terms = list(terms)
    if not terms:
        return False
    pattern = r"(?<!\w)(?:" + "|".join(re.escape(term) for term in terms) + ")"
    return re.search(pattern, text) is not None


def _first_name_containing(names, fragment: str) -> Optional[str]:
    """Return the first name whose lowercase form contains *fragment*, else None."""
    return next((name for name in names if fragment in name.lower()), None)


def map_concepts(scenario_text: str, data_registry) -> MappingResult:
    """Map healthcare concepts from scenario text to data registry.

    Args:
        scenario_text: Free-text scenario description. ``None`` or an empty
            string is treated as text that mentions no concepts.
        data_registry: Object providing ``names()`` (iterable of dataset
            names) and ``get(name)``; it is only consulted when a concept or
            pattern is actually detected in the text.

    Returns:
        A ``MappingResult`` whose ``resolved`` maps each detected concept to a
        target and whose ``missing`` holds detected concepts with no target.
        ``ambiguous`` is not populated by this function.
    """
    result = MappingResult()

    # Extract key terms from scenario
    scenario_lower = (scenario_text or "").lower()

    # Check for healthcare concepts
    for concept, synonyms in HEALTHCARE_CONCEPTS.items():
        if not _mentions(scenario_lower, synonyms):
            continue
        # Try to map to data registry
        mapped_to = _map_to_data_registry(concept, data_registry)
        if mapped_to:
            result.resolved[concept] = mapped_to
        else:
            result.missing.add(concept)

    # Additional mapping for specific healthcare patterns
    # Check for facility distribution patterns
    if _mentions(scenario_lower, ["facility distribution", "facility count", "number of facilities"]):
        facility_name = _first_name_containing(data_registry.names(), "facility")
        if facility_name is not None:
            result.resolved["facility_distribution"] = facility_name
        else:
            result.missing.add("facility_distribution")

    # Check for bed capacity patterns
    if _mentions(scenario_lower, ["bed capacity", "bed count", "staffed beds"]):
        bed_name = _first_name_containing(data_registry.names(), "bed")
        if bed_name is not None:
            result.resolved["bed_capacity"] = bed_name
        else:
            result.missing.add("bed_capacity")

    # Check for long-term care patterns
    if _mentions(scenario_lower, ["long-term care", "ltc", "nursing capacity"]):
        # Usually in facility data.
        # NOTE(review): this stores the literal concept key rather than a
        # registry name, unlike the other resolved entries -- confirm that
        # consumers expect this before changing it.
        result.resolved["long_term_care"] = "facility_distribution"

    return result


def _map_to_data_registry(concept: str, data_registry) -> Optional[str]:
    """Helper to map a concept to the data registry.

    Args:
        concept: Concept key to look up.
        data_registry: Object providing ``names()`` and ``get(name)``.

    Returns:
        The first matching registry name, or ``None`` when the concept has no
        lookup rule here or no registry entry matches.
    """
    file_names = data_registry.names()

    if concept in ["hospital", "facility_distribution", "long_term_care"]:
        return next(
            (name for name in file_names if "facility" in name.lower() or "health" in name.lower()),
            None,
        )

    if concept == "bed_capacity":
        return _first_name_containing(file_names, "bed")

    if concept == "zone":
        # Check if any dataframe has a 'zone' column
        for name in file_names:
            df = data_registry.get(name)
            if df is not None and 'zone' in df.columns:
                return name
        return None

    return None


def build_phase1_questions(scenario_text: str, registry, mapping: MappingResult) -> str:
    """Build clarifying questions based on mapping results.

    Args:
        scenario_text: Unused by the current implementation; kept for callers.
        registry: Unused by the current implementation; kept for callers.
        mapping: Result of a mapping pass.

    Returns:
        A "ready" message when more concepts are resolved than missing and
        nothing is ambiguous (or when there is nothing to ask); otherwise a
        newline-joined Markdown list of questions.
    """
    # If we have good mapping, we might not need questions
    if len(mapping.resolved) > len(mapping.missing) and not mapping.ambiguous:
        return _READY_MESSAGE

    questions = []

    # Questions for missing concepts
    if mapping.missing:
        questions.append("### Missing Information")
        # Sort so the output does not depend on set iteration order, which
        # varies between interpreter runs for strings.
        for concept in sorted(mapping.missing):
            if concept == "facility_distribution":
                questions.append("- Do you have data about healthcare facilities and their distribution?")
            elif concept == "bed_capacity":
                questions.append("- Do you have data about hospital bed capacity and changes over time?")
            else:
                questions.append(f"- Can you provide more information about {concept}?")

    # Questions for ambiguous concepts
    if mapping.ambiguous:
        questions.append("### Clarification Needed")
        for concept, options in mapping.ambiguous.items():
            questions.append(f"- For '{concept}', did you mean: {', '.join(options)}?")

    if not questions:
        return _READY_MESSAGE

    return "\n".join(questions)