Spaces:
Sleeping
Sleeping
| # app.py - Complete Dual-Mode Healthcare Analysis System | |
| import os, re, json, traceback, pathlib | |
| from functools import lru_cache | |
| from typing import List, Dict, Any, Tuple, Optional | |
| import pandas as pd | |
| import numpy as np | |
| import gradio as gr | |
| import torch | |
| import regex as re2 | |
| # Import necessary modules | |
| from settings import SNAPSHOT_PATH, PERSIST_CONTENT, HEALTHCARE_SETTINGS, MODEL_SETTINGS | |
| from audit_log import log_event, hash_summary | |
| from privacy import redact_text, safety_filter, refusal_reply | |
| from data_registry import DataRegistry | |
| from upload_ingest import extract_text_from_files | |
| # ---------- Writable caches (HF Spaces-safe) ---------- | |
# All caches are placed under the current user's home directory so that the
# libraries configured below are pointed at a location this process chose.
HOME = pathlib.Path.home()
HF_HOME = str(HOME.joinpath(".cache", "huggingface"))
HF_HUB_CACHE = str(HOME.joinpath(".cache", "huggingface", "hub"))
HF_TRANSFORMERS = str(HOME.joinpath(".cache", "huggingface", "transformers"))
ST_HOME = str(HOME.joinpath(".cache", "sentence-transformers"))
GRADIO_TMP = str(HOME.joinpath("app", "gradio"))
GRADIO_CACHE = GRADIO_TMP

# Environment defaults: setdefault() keeps any value that is already set.
for _env_name, _env_value in (
    ("HF_HOME", HF_HOME),
    ("HF_HUB_CACHE", HF_HUB_CACHE),
    ("TRANSFORMERS_CACHE", HF_TRANSFORMERS),
    ("SENTENCE_TRANSFORMERS_HOME", ST_HOME),
    ("GRADIO_TEMP_DIR", GRADIO_TMP),
    ("GRADIO_CACHE_DIR", GRADIO_CACHE),
    ("HF_HUB_ENABLE_XET", "0"),
    ("HF_HUB_ENABLE_HF_TRANSFER", "1"),
):
    os.environ.setdefault(_env_name, _env_value)

# Best-effort directory creation: a failure here is deliberately ignored so
# that importing the module never aborts because a cache dir is not writable.
for p in (HF_HOME, HF_HUB_CACHE, HF_TRANSFORMERS, ST_HOME, GRADIO_TMP, GRADIO_CACHE):
    try:
        os.makedirs(p, exist_ok=True)
    except Exception:
        pass
| # Optional Cohere | |
| try: | |
| import cohere | |
| _HAS_COHERE = True | |
| except Exception: | |
| _HAS_COHERE = False | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from huggingface_hub import login | |
| # ---------- Config ---------- | |
# Model served by the local fallback path; overridable through the environment.
MODEL_ID = os.environ.get("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct")

# Either variable name is accepted for the Hugging Face token; the first
# non-empty one wins.
HF_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

# The hosted Cohere path is used only when a key is configured AND the
# optional `cohere` package was importable.
USE_HOSTED_COHERE = bool(COHERE_API_KEY) and bool(_HAS_COHERE)

# The environment value takes precedence over MODEL_SETTINGS, which in turn
# takes precedence over the literal 2048.
MAX_NEW_TOKENS = int(
    os.environ.get(
        "MAX_NEW_TOKENS",
        MODEL_SETTINGS.get("max_new_tokens", 2048),
    )
)
| # ---------- Generic System Prompt ---------- | |
# System prompt sent as the first message of every locally generated chat.
# It is assembled from one entry per line; the joined result has no leading
# or trailing whitespace.
SYSTEM_MASTER = "\n".join((
    'SYSTEM ROLE',
    'You are an AI analytical system that provides data-driven insights for any scenario.',
    'Absolute rules:',
    '- Use ONLY information provided in this conversation (scenario text + uploaded files + user answers).',
    '- Never invent data. If something required is missing after clarifications, write the literal token: INSUFFICIENT_DATA.',
    '- Provide clear analysis with calculations, evidence, and reasoning.',
    '- Maintain privacy safeguards (aggregate data; suppress small cohorts <10).',
    '- Adapt your analysis approach to the specific scenario and data provided.',
    'Formatting rules for structured analysis:',
    '- Start with the header: "Structured Analysis"',
    '- Organize analysis into logical sections based on the scenario requirements',
    '- End with concrete recommendations and a brief "Provenance" mapping outputs to scenario text, uploaded files, and answers.',
))
| # ---------- Helper Functions ---------- | |
def find_column(df, patterns):
    """Return the first column of ``df`` whose name contains any pattern.

    Matching is a case-insensitive substring test. Columns are scanned in
    DataFrame order, so the earliest matching column wins regardless of the
    order of ``patterns``.

    Args:
        df: A pandas DataFrame, or ``None``.
        patterns: Iterable of substrings to look for in the column names.

    Returns:
        The original column label (not its string form), or ``None`` when
        ``df`` is ``None``/empty or nothing matches.
    """
    if df is None or df.empty:
        return None
    # Lower-case the patterns once instead of once per column.
    needles = [str(pattern).lower() for pattern in (patterns or ())]
    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. an integer
        # year column), so convert before lower-casing.
        name = str(col).lower()
        if any(needle in name for needle in needles):
            return col
    return None
# A task line starts with "<number>." (any number of digits) or a bullet.
_TASK_ITEM_RE = re.compile(r"(?:\d+\.|[\u2022\-*])")

# Headings that mark the end of the task list.
_TASK_SECTION_END = ('operational recommendations', 'future integration')


def extract_scenario_tasks(scenario_text):
    """Extract the numbered or bulleted task lines from a scenario description.

    Collection starts after a line beginning with "tasks" (case-insensitive)
    and stops at a line beginning with "operational recommendations" or
    "future integration". Within that window every non-empty line that starts
    with a number followed by a dot, or with one of the bullets ``•``, ``-``,
    ``*``, is returned with surrounding whitespace stripped.

    Args:
        scenario_text: The raw scenario text; ``None`` or empty is allowed.

    Returns:
        A list of task lines in document order (possibly empty).
    """
    tasks = []
    if not scenario_text:
        return tasks

    in_tasks = False
    for raw_line in scenario_text.split('\n'):
        line = raw_line.strip()
        lowered = line.lower()
        if lowered.startswith('tasks'):
            in_tasks = True
            continue
        if not in_tasks:
            continue
        if lowered.startswith(_TASK_SECTION_END):
            in_tasks = False
            continue
        # Accept any item number, not just 1-5, so long task lists are
        # captured completely.
        if _TASK_ITEM_RE.match(line):
            tasks.append(line)
    return tasks
| # ---------- Session RAG Class ---------- | |
class SessionRAG:
    """In-memory holder for the text chunks and artifacts of one session.

    Retrieval is positional: ``retrieve`` returns the first ``k`` stored
    chunks and does not use the query text.
    """

    def __init__(self):
        # Text chunks, in insertion order.
        self.docs = []
        # Artifacts registered alongside the chunks.
        self.artifacts = []
        # Column names of the most recently seen CSV (assigned by callers).
        self.csv_columns = []

    def add_docs(self, chunks):
        """Append every item of ``chunks`` to the stored documents."""
        self.docs.extend(chunks)

    def register_artifacts(self, artifacts):
        """Append every item of ``artifacts`` to the stored artifacts."""
        self.artifacts.extend(artifacts)

    def get_latest_csv_columns(self):
        """Return the currently stored CSV column list."""
        return self.csv_columns

    def retrieve(self, query, k=5):
        """Return the first ``k`` stored chunks; ``query`` is not consulted."""
        if not self.docs:
            return []
        return self.docs[:k]

    def clear(self):
        """Empty all three stores in place."""
        for store in (self.docs, self.artifacts, self.csv_columns):
            store.clear()
| # ---------- Healthcare-specific functions ---------- | |
def is_healthcare_scenario(text: str, uploaded_files_paths) -> bool:
    """Decide whether a message should be routed to the healthcare analysis.

    The message qualifies when it shows a healthcare signal (a configured
    keyword, a facility type, or a healthcare task phrase) AND either an
    uploaded file path looks healthcare-related or the text contains one of
    the structured-scenario section words.

    Args:
        text: The user message; ``None`` is treated as empty.
        uploaded_files_paths: Iterable of uploaded file paths, or ``None``
            when nothing was uploaded.

    Returns:
        ``True`` when the message looks like a healthcare scenario.
    """
    t = (text or "").lower()
    # The caller may pass None when no files were uploaded, and entries are
    # not guaranteed to be plain strings, so normalise before inspecting.
    paths = [str(path).lower() for path in (uploaded_files_paths or [])]

    # Keywords configured in the project settings.
    has_healthcare_keywords = any(
        keyword in t for keyword in HEALTHCARE_SETTINGS["healthcare_keywords"]
    )

    # Facility types: hospitals, long-term care, ambulatory care.
    facility_types = (
        "hospital", "medical center", "health centre",
        "nursing", "residential", "care facility", "long-term care",
        "ambulatory", "clinic", "surgery center", "outpatient",
    )
    has_facility_types = any(ftype in t for ftype in facility_types)

    # Phrases describing healthcare-specific tasks.
    task_phrases = (
        "bed capacity", "occupancy rates", "facility distribution",
        "long-term care", "health operations", "resource allocation",
    )
    has_healthcare_tasks = any(phrase in t for phrase in task_phrases)

    # Substring test on the whole path, as in the original heuristic.
    has_healthcare_files = any(
        "health" in path or "facility" in path or "bed" in path
        for path in paths
    )

    # Section words typical of a structured scenario write-up.
    has_scenario_structure = any(
        section in t for section in ("background", "situation", "tasks")
    )

    has_healthcare_signal = (
        has_healthcare_keywords or has_facility_types or has_healthcare_tasks
    )
    return has_healthcare_signal and (has_healthcare_files or has_scenario_structure)
def is_general_conversation(text: str, uploaded_files_paths) -> bool:
    """Return ``True`` when a message looks like ordinary chat.

    A message is NOT general conversation when files were uploaded or when
    the text contains any scenario indicator word (substring match,
    case-insensitive).

    Args:
        text: The user message; ``None`` is treated as empty.
        uploaded_files_paths: Uploaded file paths; any truthy value means
            the message is treated as a scenario.

    Returns:
        ``True`` for general conversation, ``False`` for a likely scenario.
    """
    # Uploaded files imply a scenario.
    if uploaded_files_paths:
        return False

    scenario_indicators = (
        "scenario", "analyze", "analysis", "assess", "evaluate", "recommend",
        "tasks", "background", "situation", "dataset", "data",
    )
    # Guard against None so this agrees with is_healthcare_scenario().
    text_lower = (text or "").lower()
    return not any(indicator in text_lower for indicator in scenario_indicators)
def process_healthcare_data(uploaded_files_paths, data_registry):
    """Register each uploaded file with ``data_registry``.

    Every path is handed to ``data_registry.add_path``; the outcome is
    printed. An exception for one file is printed and logged through
    ``log_event`` and does not stop the remaining files from being processed.

    Args:
        uploaded_files_paths: Iterable of file paths to register.
        data_registry: Object exposing ``add_path(path)``; a truthy return
            value is reported as success.
    """
    for path in uploaded_files_paths:
        try:
            accepted = data_registry.add_path(path)
            print(
                f"Successfully processed: {path}"
                if accepted
                else f"Failed to process: {path}"
            )
        except Exception as exc:
            print(f"Error processing {path}: {exc}")
            details = {"file": path, "error": str(exc)}
            log_event("data_processing_error", None, details)
def analyze_facility_distribution(facilities_df, province_aliases=('alberta', 'ab')):
    """Summarise facility counts by type and by the five busiest cities.

    Columns are discovered by name with ``find_column``. When a province-like
    column exists, rows are first restricted to ``province_aliases``.

    Args:
        facilities_df: DataFrame of facilities, or ``None``.
        province_aliases: Province names/codes to keep (compared
            case-insensitively after trimming whitespace). Defaults to
            Alberta, the previously hard-coded filter. Pass an empty value
            to disable filtering.

    Returns:
        A dict with ``total_facilities``, ``type_distribution``,
        ``top_cities``, ``city_breakdown`` and ``columns_used``; or
        ``{"error": message}`` when the input is unusable or processing
        fails (failures are also logged via ``log_event``).
    """
    try:
        if facilities_df is None or facilities_df.empty:
            return {"error": "No facility data provided"}

        province_col = find_column(facilities_df, ['province', 'state', 'territory'])
        if province_col and province_aliases:
            wanted = {str(alias).strip().lower() for alias in province_aliases}
            # astype(str) keeps the comparison working for non-string
            # columns; strip() tolerates padded values such as "AB ".
            normalized = facilities_df[province_col].astype(str).str.strip().str.lower()
            ab_facilities = facilities_df[normalized.isin(wanted)].copy()
        else:
            ab_facilities = facilities_df.copy()

        type_col = find_column(
            facilities_df,
            ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'],
        )
        if not type_col:
            return {"error": "Facility type column not found"}

        type_counts = ab_facilities[type_col].value_counts().to_dict()

        top_cities = []
        city_breakdown = {}
        city_col = find_column(facilities_df, ['city', 'municipality', 'town'])
        if city_col:
            # value_counts() sorts by frequency, so head(5) is the top five.
            top_cities = ab_facilities[city_col].value_counts().head(5).index.tolist()
            for city in top_cities:
                in_city = ab_facilities[ab_facilities[city_col] == city]
                city_breakdown[city] = in_city[type_col].value_counts().to_dict()

        return {
            "total_facilities": len(ab_facilities),
            "type_distribution": type_counts,
            "top_cities": top_cities,
            "city_breakdown": city_breakdown,
            "columns_used": {
                "facility_type": type_col,
                "city": city_col,
                "province": province_col,
            },
        }
    except Exception as e:
        log_event("facility_analysis_error", None, {"error": str(e)})
        return {"error": str(e)}
def analyze_bed_capacity(beds_df, province_aliases=('alberta', 'ab')):
    """Compute bed-count changes per facility and per zone.

    Columns are discovered by name with ``find_column``. ``bed_change`` and
    ``percent_change`` are derived when the input does not already carry
    them. The input DataFrame is never modified.

    Args:
        beds_df: DataFrame with a current and a previous bed-count column,
            or ``None``.
        province_aliases: Province names/codes to keep when a province-like
            column exists (case-insensitive, whitespace-trimmed). Defaults to
            Alberta, the previously hard-coded filter. Pass an empty value
            to disable filtering.

    Returns:
        A dict with ``zone_summary``, ``max_absolute_decrease``,
        ``max_percentage_decrease``, ``facilities_with_largest_declines`` and
        ``columns_used``. Without a zone-like column the first and last of
        the summaries are empty lists and the two "max" entries are ``{}``.
        On unusable input or failure, ``{"error": message}`` is returned
        (failures are also logged via ``log_event``).
    """
    try:
        if beds_df is None or beds_df.empty:
            return {"error": "No bed data provided"}

        current_col = find_column(
            beds_df, ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity']
        )
        prev_col = find_column(
            beds_df, ['prev', 'previous', '2022', 'beds_prev', 'previous_beds']
        )
        if not current_col or not prev_col:
            return {"error": f"Missing required columns. Found current: {current_col}, prev: {prev_col}"}

        def _percent(row):
            # A zero baseline would divide by zero; report 0% instead.
            return (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0

        # Work on a private copy so the derived columns never leak into the
        # caller's (possibly shared) DataFrame.
        beds_df = beds_df.copy()
        if 'bed_change' not in beds_df.columns:
            beds_df['bed_change'] = beds_df[current_col] - beds_df[prev_col]
        if 'percent_change' not in beds_df.columns:
            beds_df['percent_change'] = beds_df.apply(_percent, axis=1)

        province_col = find_column(beds_df, ['province', 'state', 'territory'])
        if province_col and province_aliases:
            wanted = {str(alias).strip().lower() for alias in province_aliases}
            # astype(str) keeps the comparison working for non-string columns.
            normalized = beds_df[province_col].astype(str).str.strip().str.lower()
            ab_beds = beds_df[normalized.isin(wanted)].copy()
        else:
            ab_beds = beds_df.copy()

        zone_summary = pd.DataFrame()
        max_abs_decrease = {}
        max_pct_decrease = {}
        facilities_decline = pd.DataFrame()

        zone_col = find_column(beds_df, ['zone', 'region', 'area', 'district'])
        if zone_col:
            zone_summary = ab_beds.groupby(zone_col).agg({
                current_col: 'sum',
                prev_col: 'sum',
                'bed_change': 'sum',
            }).reset_index()
            # Recomputed from the zone totals rather than averaged from rows.
            zone_summary['percent_change'] = zone_summary.apply(_percent, axis=1)

            if len(zone_summary) > 0:
                # idxmin() selects the most negative change, i.e. the
                # largest decrease.
                max_abs_decrease = zone_summary.loc[zone_summary['bed_change'].idxmin()]
                max_pct_decrease = zone_summary.loc[zone_summary['percent_change'].idxmin()]

            # Ascending sort puts the largest losses first.
            facilities_decline = ab_beds.sort_values('bed_change').head(5)

        return {
            "zone_summary": zone_summary.to_dict('records') if not zone_summary.empty else [],
            "max_absolute_decrease": max_abs_decrease.to_dict() if isinstance(max_abs_decrease, pd.Series) else max_abs_decrease,
            "max_percentage_decrease": max_pct_decrease.to_dict() if isinstance(max_pct_decrease, pd.Series) else max_pct_decrease,
            "facilities_with_largest_declines": facilities_decline.to_dict('records') if not facilities_decline.empty else [],
            "columns_used": {
                "beds_current": current_col,
                "beds_prev": prev_col,
                "zone": zone_col,
                "province": province_col,
            },
        }
    except Exception as e:
        log_event("bed_analysis_error", None, {"error": str(e)})
        return {"error": str(e)}
def assess_long_term_capacity(facilities_df, beds_df, zone_name):
    """Assess long-term care capacity in the busiest city of a zone.

    The facilities of ``zone_name`` are selected (or, without a zone-like
    column, the Alberta rows when a province-like column exists, otherwise
    all rows). The city with the most facilities is taken as the major city,
    and the ratio of nursing/residential facilities to hospitals there is
    compared against 1.5.

    Args:
        facilities_df: DataFrame of facilities, or ``None``.
        beds_df: Accepted for interface compatibility; not used.
        zone_name: Zone value to match exactly in the zone column.

    Returns:
        A dict with ``zone``, ``major_city``, ``facility_counts``,
        ``nursing_to_hospital_ratio``, ``capacity_assessment`` and
        ``columns_used``; or ``{"error": message}`` when the input is
        unusable, no city can be determined, or processing fails (failures
        are also logged via ``log_event``).
    """
    # Deliberately narrower than a bare 'care': "Ambulatory health care
    # services" must not be counted as long-term care.
    ltc_keywords = (
        'nursing', 'residential', 'long-term care', 'long term care',
        'care facility', 'care home',
    )
    try:
        if facilities_df is None or facilities_df.empty:
            return {"error": "No facility data provided"}

        zone_col = find_column(facilities_df, ['zone', 'region', 'area', 'district'])
        city_col = find_column(facilities_df, ['city', 'municipality', 'town'])
        type_col = find_column(
            facilities_df,
            ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'],
        )
        if not type_col:
            return {"error": "Facility type column not found"}

        if zone_col:
            zone_facilities = facilities_df[facilities_df[zone_col] == zone_name].copy()
        else:
            # No zone information: fall back to the provincial subset.
            province_col = find_column(facilities_df, ['province', 'state', 'territory'])
            if province_col:
                # astype(str) keeps the comparison working for non-string columns.
                normalized = facilities_df[province_col].astype(str).str.strip().str.lower()
                zone_facilities = facilities_df[normalized.isin(['alberta', 'ab'])].copy()
            else:
                zone_facilities = facilities_df.copy()

        if city_col:
            city_counts = zone_facilities[city_col].value_counts()
            if len(city_counts) > 0:
                # value_counts() is sorted by frequency; the first label is
                # the city with the most facilities.
                major_city = city_counts.index[0]
                city_facilities = zone_facilities[zone_facilities[city_col] == major_city]
                facility_counts = city_facilities[type_col].value_counts().to_dict()

                hospitals = 0
                nursing = 0
                for label, count in facility_counts.items():
                    # Type labels are not guaranteed to be strings.
                    lowered = str(label).lower()
                    if 'hospital' in lowered:
                        hospitals += count
                    if any(word in lowered for word in ltc_keywords):
                        nursing += count

                # Without hospitals the ratio is reported as 0.
                ratio = nursing / hospitals if hospitals > 0 else 0
                capacity_assessment = "sufficient" if ratio >= 1.5 else "insufficient"

                return {
                    "zone": zone_name,
                    "major_city": major_city,
                    "facility_counts": facility_counts,
                    "nursing_to_hospital_ratio": ratio,
                    "capacity_assessment": capacity_assessment,
                    "columns_used": {
                        "zone": zone_col,
                        "city": city_col,
                        "facility_type": type_col,
                    },
                }

        return {"error": "Could not determine major city or facility counts"}
    except Exception as e:
        log_event("ltc_assessment_error", None, {"error": str(e)})
        return {"error": str(e)}
def generate_operational_recommendations(analysis_results):
    """Derive operational recommendations from the analysis results.

    Up to three recommendations are produced, in this order:

    1. Restore staffed beds in the zone with the largest percentage
       decrease - only when that zone actually lost capacity.
    2. Expand long-term care capacity in the major city - only when the
       long-term care assessment is ``"insufficient"``.
    3. Implement surge capacity plans - whenever a bed capacity analysis
       completed without error.

    Args:
        analysis_results: Dict that may contain ``bed_capacity`` and
            ``long_term_care`` result dicts; ``None`` is treated as empty.

    Returns:
        A list of dicts with ``title``, ``description`` and ``data_source``.
    """
    recommendations = []
    analysis_results = analysis_results or {}

    bed_data = analysis_results.get('bed_capacity')
    # A failed bed analysis carries no evidence, so nothing is derived from it.
    bed_ok = isinstance(bed_data, dict) and 'error' not in bed_data

    # Recommendation 1: address bed capacity losses.
    if bed_ok:
        worst = bed_data.get('max_percentage_decrease')
        zone_col = (bed_data.get('columns_used') or {}).get('zone')
        if isinstance(worst, dict) and zone_col:
            zone = worst.get(zone_col, '')
            try:
                change = float(worst.get('percent_change', 0))
            except (TypeError, ValueError):
                change = 0.0
            # The "worst" zone is simply the minimum change; if even that is
            # not negative, no zone lost beds and there is nothing to restore.
            if zone and change < 0:
                recommendations.append({
                    "title": f"Restore staffed beds in {zone} Zone",
                    # abs() avoids the double negative "-12.3% decrease".
                    "description": f"Priority should be given to reopening closed units and hiring staff to address the {abs(change):.1f}% decrease in bed capacity.",
                    "data_source": "Bed capacity analysis",
                })

    # Recommendation 2: expand long-term care capacity.
    ltc_data = analysis_results.get('long_term_care')
    if isinstance(ltc_data, dict) and ltc_data.get('capacity_assessment') == 'insufficient':
        city = ltc_data.get('major_city', '')
        if city:
            recommendations.append({
                "title": f"Expand long-term care capacity in {city}",
                "description": "Invest in new long-term care beds or repurpose existing sites to expedite discharge of stabilized patients.",
                "data_source": "Long-term care capacity assessment",
            })

    # Recommendation 3: surge plans.
    if bed_ok:
        recommendations.append({
            "title": "Implement surge capacity plans",
            "description": "Develop modular units and activate staffing pools to handle unpredictable spikes in demand.",
            "data_source": "Bed capacity trends",
        })

    return recommendations
def generate_ai_integration_discussion(analysis_results):
    """Return the fixed discussion block on future AI integration.

    The content does not depend on ``analysis_results``; a fresh dict with
    the keys ``title``, ``description``, ``example`` and ``metrics`` is
    built on every call.
    """
    discussion = dict(
        title="Future Integration for Augmented Decision-Making",
        description="Combining facility information with operational data like emergency department wait times and disease surveillance can enable AI-driven resource optimization.",
        example="A model could ingest current ED wait times, hospital occupancy, and community case counts to forecast bed demand by zone and recommend redirecting ambulances to facilities with spare capacity.",
        metrics=["Hospital occupancy rates", "ED wait times", "Disease surveillance data"],
    )
    return discussion
def _fmt_number_or_na(value, spec, suffix=""):
    """Format ``value`` with ``spec`` and ``suffix``; return "N/A" if not numeric."""
    if value is None or isinstance(value, bool):
        return "N/A"
    try:
        return f"{float(value):{spec}}{suffix}"
    except (TypeError, ValueError):
        return "N/A"


def _render_facility_section(fd):
    """Render section 1: facility totals, type distribution and top cities."""
    lines = ["## 1. Data Preparation\n\n"]
    if 'error' in fd:
        lines.append(f"Error in facility distribution analysis: {fd['error']}\n\n")
        return lines

    lines.append(f"Total healthcare facilities: {fd.get('total_facilities', 'N/A')}\n\n")

    type_distribution = fd.get('type_distribution')
    if isinstance(type_distribution, dict):
        lines.append("### Facility Type Distribution\n\n")
        lines.extend(f"- {ftype}: {count}\n" for ftype, count in type_distribution.items())
        lines.append("\n")

    city_breakdown = fd.get('city_breakdown')
    if isinstance(city_breakdown, dict):
        lines.append("### Top Cities by Facility Count\n\n")
        lines.append("| City | Hospitals | Nursing/Residential | Ambulatory | Total |\n")
        lines.append("|------|-----------|-------------------|------------|-------|\n")
        for city, breakdown in city_breakdown.items():
            hospitals = breakdown.get('Hospitals', 0)
            nursing = breakdown.get('Nursing and residential care facilities', 0)
            ambulatory = breakdown.get('Ambulatory health care services', 0)
            # The total covers only the three categories shown in the table.
            total = hospitals + nursing + ambulatory
            lines.append(f"| {city} | {hospitals} | {nursing} | {ambulatory} | {total} |\n")
        lines.append("\n")
    return lines


def _render_bed_capacity_section(bc):
    """Render section 2: zone table, worst zones and largest facility declines."""
    lines = ["## 2. Bed Capacity Analysis\n\n"]
    if 'error' in bc:
        lines.append(f"Error in bed capacity analysis: {bc['error']}\n\n")
        return lines

    columns = bc.get('columns_used', {})
    zone_col = columns.get('zone')
    current_col = columns.get('beds_current')
    prev_col = columns.get('beds_prev')

    def _cell(record, column):
        # A missing column name means the value cannot be looked up at all.
        return record.get(column, 'N/A') if column else 'N/A'

    zone_summary = bc.get('zone_summary')
    if zone_summary:
        lines.append("### Bed Capacity by Zone\n\n")
        lines.append("| Zone | Beds (Current) | Beds (Previous) | Absolute Change | Percent Change |\n")
        lines.append("|------|---------------|-----------------|-----------------|----------------|\n")
        for zone_data in zone_summary:
            change = zone_data.get('bed_change', 'N/A')
            # A missing percentage renders as "N/A" instead of raising on
            # the numeric format specifier.
            pct = _fmt_number_or_na(zone_data.get('percent_change'), ".1f", "%")
            lines.append(
                f"| {_cell(zone_data, zone_col)} | {_cell(zone_data, current_col)} "
                f"| {_cell(zone_data, prev_col)} | {change} | {pct} |\n"
            )
        lines.append("\n")

    abs_dec = bc.get('max_absolute_decrease')
    pct_dec = bc.get('max_percentage_decrease')
    # Both entries are {} when no zone column was found; there is then
    # nothing meaningful to report, so the two lines are omitted.
    if isinstance(abs_dec, dict) and isinstance(pct_dec, dict) and abs_dec and pct_dec:
        pct_text = _fmt_number_or_na(pct_dec.get('percent_change'), ".1f", "%")
        lines.append(
            f"**Zone with largest absolute decrease**: {_cell(abs_dec, zone_col)} "
            f"({abs_dec.get('bed_change', 'N/A')} beds)\n\n"
        )
        lines.append(
            f"**Zone with largest percentage decrease**: {_cell(pct_dec, zone_col)} "
            f"({pct_text})\n\n"
        )

    declines = bc.get('facilities_with_largest_declines')
    if declines:
        lines.append("### Facilities with Largest Bed Declines\n\n")
        lines.append("| Facility | Zone | Teaching Status | Beds Lost |\n")
        lines.append("|----------|------|----------------|-----------|\n")
        for facility in declines:
            name = facility.get('facility_name', 'N/A')
            teaching = facility.get('teaching_status', 'N/A')
            change = facility.get('bed_change', 'N/A')
            lines.append(f"| {name} | {_cell(facility, zone_col)} | {teaching} | {change} |\n")
        lines.append("\n")
    return lines


def _render_ltc_section(ltc):
    """Render section 3: long-term care ratio, assessment and facility counts."""
    lines = ["## 3. Long-Term Care Capacity Assessment\n\n"]
    if 'error' in ltc:
        lines.append(f"Error in long-term care assessment: {ltc['error']}\n\n")
        return lines

    zone = ltc.get('zone', 'N/A')
    city = ltc.get('major_city', 'N/A')
    ratio = _fmt_number_or_na(ltc.get('nursing_to_hospital_ratio', 0), ".2f")
    assessment = ltc.get('capacity_assessment', 'N/A')
    lines.append(
        f"In {zone} Zone, the major city is {city} with a nursing/residential "
        f"to hospital ratio of {ratio}.\n\n"
    )
    lines.append(f"Long-term care capacity appears **{assessment}** in {city}.\n\n")

    facility_counts = ltc.get('facility_counts')
    if isinstance(facility_counts, dict):
        lines.append("### Facility Counts\n\n")
        lines.extend(f"- {ftype}: {count}\n" for ftype, count in facility_counts.items())
        lines.append("\n")
    return lines


def format_healthcare_analysis_response(scenario_text, results, recommendations, ai_integration):
    """Render the healthcare analysis as a Markdown report.

    Sections 1-3 appear only when the corresponding key
    (``facility_distribution``, ``bed_capacity``, ``long_term_care``) is
    present in ``results``; a result carrying an ``error`` key is rendered
    as an error line. Sections 4 (recommendations), 5 (AI integration) and
    "Provenance" are always emitted. Missing or non-numeric figures are
    rendered as "N/A".

    Args:
        scenario_text: Accepted for interface compatibility; not used when
            rendering.
        results: Dict of analysis result dicts keyed by section name.
        recommendations: List of dicts with ``title``, ``description`` and
            ``data_source``; may be empty.
        ai_integration: Dict with ``title``, ``description``, ``example``
            and ``metrics``.

    Returns:
        The complete Markdown document as a single string.
    """
    # Fragments are collected and joined once instead of repeated `+=`.
    parts = ["# Structured Analysis: Healthcare Scenario\n\n"]

    if 'facility_distribution' in results:
        parts.extend(_render_facility_section(results['facility_distribution']))
    if 'bed_capacity' in results:
        parts.extend(_render_bed_capacity_section(results['bed_capacity']))
    if 'long_term_care' in results:
        parts.extend(_render_ltc_section(results['long_term_care']))

    parts.append("## 4. Operational Recommendations\n\n")
    if recommendations:
        for rec in recommendations:
            parts.append(f"### {rec['title']}\n\n")
            parts.append(f"{rec['description']}\n\n")
            parts.append(f"*Data source: {rec['data_source']}*\n\n")
    else:
        parts.append("No specific recommendations could be generated due to data limitations.\n\n")

    parts.append("## 5. Future Integration for Augmented AI\n\n")
    parts.append(f"### {ai_integration['title']}\n\n")
    parts.append(f"{ai_integration['description']}\n\n")
    parts.append(f"**Example**: {ai_integration['example']}\n\n")
    parts.append("**Key metrics to incorporate**:\n")
    parts.extend(f"- {metric}\n" for metric in ai_integration['metrics'])
    parts.append("\n")

    parts.append("## Provenance\n\n")
    parts.append("This analysis is based on:\n")
    parts.append("- Scenario description provided by the user\n")
    parts.append("- Uploaded data files\n")
    parts.append("- Calculations performed on the provided data\n")
    return "".join(parts)
def handle_healthcare_scenario(scenario_text, data_registry, history):
    """Run the healthcare analysis pipeline and return the Markdown report.

    The first registered facility file and the first registered bed file are
    analysed. The long-term care assessment runs only when the bed analysis
    identified a worst zone and facility data is available. Any exception is
    logged and turned into an error string.

    Args:
        scenario_text: The scenario description supplied by the user.
        data_registry: Object exposing ``get_data_by_type(kind)`` and
            ``get(name)``.
        history: Chat history; not used by this function.

    Returns:
        The formatted report, or an ``"Error analyzing healthcare scenario:
        ..."`` message on failure.
    """
    try:
        analysis = {}

        # The extracted tasks are only printed for diagnostics.
        task_list = extract_scenario_tasks(scenario_text)
        print(f"Extracted tasks: {task_list}")

        facility_files = data_registry.get_data_by_type('facility_data')
        bed_files = data_registry.get_data_by_type('bed_data')

        # Only the first file of each kind is used.
        facilities_df = data_registry.get(facility_files[0]) if facility_files else None
        beds_df = data_registry.get(bed_files[0]) if bed_files else None

        have_facilities = facilities_df is not None
        have_beds = beds_df is not None
        log_event("data_files_found", None, {
            "facilities": have_facilities,
            "beds": have_beds,
            "facility_files": facility_files,
            "bed_files": bed_files
        })

        # Task 1: facility distribution.
        if have_facilities:
            analysis['facility_distribution'] = analyze_facility_distribution(facilities_df)

        # Task 2: bed capacity.
        if have_beds:
            analysis['bed_capacity'] = analyze_bed_capacity(beds_df)

        # Task 3: long-term care capacity in the zone with the largest
        # percentage decrease.
        if 'bed_capacity' in analysis:
            bed_result = analysis['bed_capacity']
            if 'max_percentage_decrease' in bed_result:
                zone_col = bed_result.get('columns_used', {}).get('zone')
                if zone_col:
                    worst_zone = bed_result['max_percentage_decrease'].get(zone_col, '')
                    if worst_zone and have_facilities:
                        analysis['long_term_care'] = assess_long_term_capacity(
                            facilities_df, beds_df, worst_zone
                        )

        # Tasks 4.1 / 4.2: recommendations and AI integration discussion.
        recommendations = generate_operational_recommendations(analysis)
        ai_integration = generate_ai_integration_discussion(analysis)

        return format_healthcare_analysis_response(
            scenario_text, analysis, recommendations, ai_integration
        )
    except Exception as exc:
        log_event("healthcare_scenario_error", None, {"error": str(exc)})
        return f"Error analyzing healthcare scenario: {str(exc)}"
| # ---------- Model loading helpers ---------- | |
def pick_dtype_and_map():
    """Choose the torch dtype and device map for loading the model.

    Returns:
        ``(torch.float16, "auto")`` when CUDA is available,
        ``(torch.float16, {"": "mps"})`` when MPS is available, and
        ``(torch.float32, "cpu")`` otherwise.
    """
    if torch.cuda.is_available():
        choice = (torch.float16, "auto")
    elif torch.backends.mps.is_available():
        choice = (torch.float16, {"": "mps"})
    else:
        choice = (torch.float32, "cpu")
    return choice
@lru_cache(maxsize=1)
def load_local_model():
    """Load the local causal LM and its tokenizer, once per process.

    The result is memoised: the chat handler calls this function for every
    message, and without the cache the model would be re-instantiated on
    each turn. A failed load raises and is not cached, so a later call
    retries.

    Returns:
        ``(model, tokenizer)`` for ``MODEL_ID``.

    Raises:
        RuntimeError: If no Hugging Face token is configured.
    """
    if not HF_TOKEN:
        raise RuntimeError("HUGGINGFACE_HUB_TOKEN is not set.")
    login(token=HF_TOKEN, add_to_git_credential=False)

    dtype, device_map = pick_dtype_and_map()
    cache_dir = os.environ.get("TRANSFORMERS_CACHE")

    tok = AutoTokenizer.from_pretrained(
        MODEL_ID, token=HF_TOKEN, use_fast=True, model_max_length=8192,
        padding_side="left", trust_remote_code=True,
        cache_dir=cache_dir,
    )
    try:
        mdl = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN, device_map=device_map,
            low_cpu_mem_usage=True, torch_dtype=dtype, trust_remote_code=True,
            cache_dir=cache_dir,
        )
    except Exception:
        # Fallback: retry without a device map and place the model manually.
        # NOTE(review): the source's indentation was lost; the manual .to()
        # is assumed to belong to this fallback path only - confirm.
        mdl = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, token=HF_TOKEN,
            low_cpu_mem_usage=True, torch_dtype=dtype, trust_remote_code=True,
            cache_dir=cache_dir,
        )
        mdl.to("cuda" if torch.cuda.is_available() else "cpu")

    # Make sure generation has an end-of-sequence id to stop on.
    if mdl.config.eos_token_id is None and tok.eos_token_id is not None:
        mdl.config.eos_token_id = tok.eos_token_id
    return mdl, tok
| # ---------- Chat helpers ---------- | |
# Compiled once at import time; matched against stripped, lower-cased text.
_IDENTITY_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"\bwho\s+are\s+you\b", r"\bwhat\s+are\s+you\b", r"\bwhat\s+is\s+your\s+name\b",
    r"\bwho\s+is\s+this\b", r"\bidentify\s+yourself\b", r"\btell\s+me\s+about\s+yourself\b",
    # No trailing \b: a word boundary after "?" only exists when a word
    # character follows, so "and you?" at the end of a message never matched.
    r"\bdescribe\s+yourself\b", r"\band\s+you\s*\?", r"\byour\s+name\b",
    r"\bwho\s+am\s+i\s+chatting\s+with\b",
))


def is_identity_query(message, history):
    """Return ``True`` when the user is asking who or what the assistant is.

    The current ``message`` is checked first. The last history entry is
    consulted only when it is still awaiting a reply (its assistant slot is
    missing or empty), i.e. when it represents the pending user turn. An
    already-answered identity question must not cause the following,
    unrelated message to be treated as an identity question too.

    Args:
        message: The current user message; non-strings never match.
        history: Sequence of ``(user, assistant)`` pairs, or ``None``.

    Returns:
        ``True`` for an identity question, ``False`` otherwise.
    """
    def _matches(text):
        if not isinstance(text, str):
            return False
        lowered = text.strip().lower()
        return any(pattern.search(lowered) for pattern in _IDENTITY_PATTERNS)

    if _matches(message):
        return True

    if history:
        last = history[-1]
        if isinstance(last, (list, tuple)) and last:
            reply_pending = len(last) < 2 or not last[1]
            if reply_pending and _matches(last[0]):
                return True
    return False
| def _iter_user_assistant(history): | |
| for item in (history or []): | |
| if isinstance(item, (list, tuple)): | |
| u = item[0] if len(item) > 0 else "" | |
| a = item[1] if len(item) > 1 else "" | |
| yield u, a | |
| def _sanitize_text(s: str) -> str: | |
| if not isinstance(s, str): | |
| return s | |
| return re2.sub(r'[\p{C}--[\n\t]]+', '', s) | |
def cohere_chat(message, history):
    """Ask the hosted Cohere model for a reply.

    The conversation is flattened into a "User:/Assistant:" transcript that
    ends with the new message and an open "Assistant:" line.

    Args:
        message: The current user message.
        history: Chat history as accepted by ``_iter_user_assistant``.

    Returns:
        The stripped reply text, or ``None`` when the hosted path is
        disabled, the response carries no text, or the request fails.
        Failures are recorded via ``log_event`` so that the fallback to the
        local model is not silent.
    """
    if not USE_HOSTED_COHERE:
        return None
    try:
        client = cohere.Client(api_key=COHERE_API_KEY)

        transcript = []
        for u, a in _iter_user_assistant(history):
            if u:
                transcript.append(f"User: {u}")
            if a:
                transcript.append(f"Assistant: {a}")
        transcript.append(f"User: {message}")
        prompt = "\n".join(transcript) + "\nAssistant:"

        resp = client.chat(
            model="command-r7b-12-2024",
            message=prompt,
            temperature=MODEL_SETTINGS.get("temperature", 0.3),
            max_tokens=MAX_NEW_TOKENS,
        )

        # The reply is read from whichever of these attributes is populated.
        for attr in ("text", "reply"):
            value = getattr(resp, attr, None)
            if value:
                return value.strip()
        generations = getattr(resp, "generations", None)
        if generations:
            return generations[0].text.strip()
        return None
    except Exception as e:
        # Still best-effort (the caller falls back to the local model), but
        # the failure is logged in the same way as elsewhere in this module.
        log_event("cohere_chat_error", None, {"error": str(e)})
        return None
def build_inputs(tokenizer, message, history):
    """Build the tokenized chat prompt for the local model.

    The conversation starts with ``SYSTEM_MASTER`` as the system message,
    followed by every non-empty user/assistant text from ``history`` and
    finally the new user ``message``. The result of
    ``tokenizer.apply_chat_template`` (tokenized, with generation prompt,
    ``return_tensors="pt"``) is returned as-is.
    """
    conversation = [{"role": "system", "content": SYSTEM_MASTER}]
    for user_text, assistant_text in _iter_user_assistant(history):
        for role, content in (("user", user_text), ("assistant", assistant_text)):
            # Empty slots (e.g. a turn without a reply) are left out.
            if content:
                conversation.append({"role": role, "content": content})
    conversation.append({"role": "user", "content": message})

    return tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
def local_generate(model, tokenizer, input_ids, max_new_tokens=MAX_NEW_TOKENS):
    """Generate a completion with the local model and return it as text.

    ``input_ids`` is moved to the model's device, sampling parameters are
    read from ``MODEL_SETTINGS`` (with defaults), and only the tokens
    produced after the prompt are decoded, with special tokens skipped and
    surrounding whitespace stripped.
    """
    prompt_ids = input_ids.to(model.device)

    sampling = {
        "do_sample": True,
        "temperature": MODEL_SETTINGS.get("temperature", 0.3),
        "top_p": MODEL_SETTINGS.get("top_p", 0.9),
        "repetition_penalty": MODEL_SETTINGS.get("repetition_penalty", 1.15),
        # The EOS id doubles as the padding id.
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        generated = model.generate(
            input_ids=prompt_ids,
            max_new_tokens=max_new_tokens,
            **sampling,
        )

    # Drop the echoed prompt: keep only what follows the input length.
    prompt_len = prompt_ids.shape[-1]
    completion = generated[0, prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()
| # ---------- Core chat logic ---------- | |
def _finish_general_reply(user_msg, safe_in, raw_out, history, mode):
    """Sanitize, output-filter, audit-log and append a general-mode reply."""
    cleaned = _sanitize_text(raw_out or "")
    safe_out, blocked_out, reason_out = safety_filter(cleaned, mode="output")
    if blocked_out:
        safe_out = refusal_reply(reason_out)
    log_event("assistant_reply", None, {
        **hash_summary("prompt", safe_in if not PERSIST_CONTENT else ""),
        **hash_summary("reply", safe_out if not PERSIST_CONTENT else ""),
        "mode": mode,
    })
    return history + [(user_msg, safe_out)], False


def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answers=False):
    """Produce the assistant's reply for one user message.

    Flow: input safety filter -> identity shortcut -> ingest uploaded files ->
    either the healthcare-scenario handler or general conversation (hosted
    Cohere when enabled and it returns text, otherwise the local model).

    Args:
        user_msg: Raw text typed by the user.
        history: Prior ``(user, assistant)`` turns; ``None`` is treated as an
            empty conversation.
        tz: Not used by this function; kept for interface compatibility.
        uploaded_files_paths: Paths of files uploaded in this session.
        awaiting_answers: Flag returned unchanged on the refusal, identity and
            outer-error paths; every other path returns ``False``.

    Returns:
        ``(new_history, awaiting_answers)`` where ``new_history`` is
        ``history`` with one ``(user_msg, reply)`` pair appended. Errors are
        reported as the reply text rather than raised.
    """
    # Normalise up front so that every `history + [...]` below -- including
    # the one inside the outer error handler -- is safe when history is None.
    history = list(history or [])
    try:
        log_event("user_message", None, {"sizes": {"chars": len(user_msg or "")}})
        safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
        if blocked_in:
            ans = refusal_reply(reason_in)
            return history + [(user_msg, ans)], awaiting_answers

        if is_identity_query(safe_in, history):
            ans = "I am an AI analytical system designed to help with both general conversations and healthcare scenario analysis. I can answer your questions and also analyze healthcare data when you upload files and describe a scenario."
            return history + [(user_msg, ans)], awaiting_answers

        # Fresh registry and session RAG for this call.
        # NOTE(review): session_rag is populated below but never read again in
        # this function -- confirm whether it should be passed to a handler.
        data_registry = DataRegistry()
        session_rag = SessionRAG()

        if uploaded_files_paths:
            process_healthcare_data(uploaded_files_paths, data_registry)

            # Also extract text for RAG
            ing = extract_text_from_files(uploaded_files_paths)
            if ing.get("chunks"):
                session_rag.add_docs(ing["chunks"])
            if ing.get("artifacts"):
                session_rag.register_artifacts(ing["artifacts"])

            # Record CSV columns on the session RAG. Each CSV overwrites the
            # previous value, so only the last one iterated is kept.
            for file_name in data_registry.names():
                if file_name.endswith('.csv'):
                    df = data_registry.get(file_name)
                    session_rag.csv_columns = list(df.columns)

        # Healthcare scenario mode
        if is_healthcare_scenario(safe_in, uploaded_files_paths):
            # NOTE(review): unlike the general branches, this reply is returned
            # without the output safety filter or an "assistant_reply" audit
            # event -- confirm that this is intentional.
            response = handle_healthcare_scenario(safe_in, data_registry, history)
            return history + [(user_msg, response)], False

        # General conversation mode: try hosted Cohere first if enabled.
        if USE_HOSTED_COHERE:
            out = cohere_chat(safe_in, history)
            if out:
                return _finish_general_reply(
                    user_msg, safe_in, out, history, "general_cohere"
                )

        # Fall back to local model
        try:
            model, tokenizer = load_local_model()
            inputs = build_inputs(tokenizer, safe_in, history)
            out = local_generate(model, tokenizer, inputs, max_new_tokens=MAX_NEW_TOKENS)
            if isinstance(out, str):
                # Drop role labels the model may echo at the start of its text.
                for tag in ("Assistant:", "System:", "User:"):
                    if out.startswith(tag):
                        out = out[len(tag):].strip()
            return _finish_general_reply(
                user_msg, safe_in, out, history, "general_local"
            )
        except Exception as e:
            err = f"Error generating response: {str(e)}"
            log_event("model_error", None, {"error": str(e)})
            return history + [(user_msg, err)], False
    except Exception as e:
        err = f"Error: {e}"
        # Printing the traceback is best-effort; it must never mask the reply.
        try:
            traceback.print_exc()
        except Exception:
            pass
        return history + [(user_msg, err)], awaiting_answers
| # ---------- UI Setup ---------- | |
# Gradio theme: Soft base with teal primary / slate neutral hues and large radii.
theme = gr.themes.Soft(
    primary_hue="teal",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_lg,
)

# Custom CSS layered on top of the theme.
# The container background is --brand-bg (#0f172a), so text placed directly on
# it (hero heading and hint) must use light colours; the previous dark values
# matched the background and were unreadable.
custom_css = """
:root { --brand-bg: #0f172a; --brand-accent: #0d9488; --brand-text: #0f172a; --brand-text-light: #ffffff; }
html, body, .gradio-container { height: 100vh; }
.gradio-container { background: var(--brand-bg); display: flex; flex-direction: column; }
/* HERO (landing) */
#hero-wrap { height: 70vh; display: grid; place-items: center; }
#hero { text-align: center; }
#hero h2 { color: var(--brand-text-light); font-weight: 800; font-size: 32px; margin-bottom: 22px; }
#hero .search-row { width: min(860px, 92vw); margin: 0 auto; display: flex; gap: 8px; align-items: stretch; }
#hero .search-row .hero-box { flex: 1 1 auto; }
#hero .search-row .hero-box textarea { height: 52px !important; }
#hero-send > button { height: 52px !important; padding: 0 18px !important; border-radius: 12px !important; }
#hero .hint { color: #cbd5e1; margin-top: 10px; font-size: 13px; opacity: 0.9; }
/* CHAT */
#chat-container { position: relative; }
.chatbot header, .chatbot .label, .chatbot .label-wrap { display: none !important; }
.message.user, .message.bot { background: var(--brand-accent) !important; color: var(--brand-text-light) !important; border-radius: 12px !important; padding: 8px 12px !important; }
textarea, input, .gr-input { border-radius: 12px !important; }
/* Chat input row equal heights */
#chat-input-row { align-items: stretch; }
#chat-msg textarea { height: 52px !important; }
#chat-send > button, #chat-clear > button { height: 52px !important; padding: 0 18px !important; border-radius: 12px !important; }
"""
| # ---------- Main App ---------- | |
with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
    # --- HERO (initial screen) ---
    with gr.Column(elem_id="hero-wrap", visible=True) as hero_wrap:
        with gr.Column(elem_id="hero"):
            gr.HTML("<h2>How can I help you today?</h2>")
            with gr.Row(elem_classes="search-row"):
                hero_msg = gr.Textbox(
                    placeholder="Ask me anything or upload healthcare data files for scenario analysis…",
                    show_label=False,
                    lines=1,
                    elem_classes="hero-box",
                )
                hero_send = gr.Button("➤", scale=0, elem_id="hero-send")
            gr.Markdown('<div class="hint">I can help with general questions or analyze healthcare scenarios when you upload data files and describe your analysis needs.</div>')

    # --- MAIN APP (hidden until first message) ---
    with gr.Column(elem_id="chat-container", visible=False) as app_wrap:
        chat = gr.Chatbot(label="", show_label=False, height="80vh")
        with gr.Row():
            uploads = gr.Files(
                label="Upload healthcare data files",
                file_types=HEALTHCARE_SETTINGS["supported_file_types"],
                file_count="multiple",
                height=68,
            )
        with gr.Row(elem_id="chat-input-row"):
            msg = gr.Textbox(
                label="",
                show_label=False,
                placeholder="Ask me anything or continue your healthcare scenario analysis…",
                scale=10,
                elem_id="chat-msg",
                lines=1,
            )
            send = gr.Button("Send", scale=1, elem_id="chat-send")
            clear = gr.Button("Clear chat", scale=1, elem_id="chat-clear")

    # ---- State
    state_history = gr.State(value=[])
    state_uploaded = gr.State(value=[])
    state_awaiting = gr.State(value=False)

    # ---- Uploads
    def _store_uploads(files, current):
        """Merge newly reported upload paths into the stored list.

        Paths already stored are not appended again, so a change event that
        re-reports earlier files does not cause them to be processed twice.
        """
        merged = list(current or [])
        for f in (files or []):
            # Entries may be file objects (with .name) or plain path values.
            path = getattr(f, "name", None) or f
            if path not in merged:
                merged.append(path)
        return merged

    uploads.change(fn=_store_uploads, inputs=[uploads, state_uploaded], outputs=state_uploaded)

    # ---- Core send (used by both hero input and chat input)
    def _on_send(user_msg, history, up_paths, awaiting):
        """Run one chat turn.

        Returns ``(chat value, cleared textbox, history state, awaiting flag)``.
        Blank input leaves the conversation untouched; any exception is shown
        as the assistant's reply instead of propagating to Gradio.
        """
        try:
            if not user_msg or not user_msg.strip():
                return history, "", history, awaiting
            new_history, new_awaiting = clarityops_reply(
                user_msg.strip(), history or [], None, up_paths or [], awaiting_answers=awaiting
            )
            return new_history, "", new_history, new_awaiting
        except Exception as e:
            err = f"Error: {e}"
            # Best-effort diagnostics; never let logging break the reply.
            try:
                traceback.print_exc()
            except Exception:
                pass
            new_hist = (history or []) + [(user_msg or "", err)]
            return new_hist, "", new_hist, awaiting

    # ---- Hero -> App transition + first send
    def _hero_start(user_msg, history, up_paths, awaiting):
        """Send the first message, then hide the hero and reveal the chat view."""
        chat_o, msg_o, hist_o, await_o = _on_send(user_msg, history, up_paths, awaiting)
        return (
            chat_o, msg_o, hist_o, await_o,
            gr.update(visible=False),   # hero_wrap
            gr.update(visible=True),    # app_wrap
            "",                         # hero_msg
        )

    hero_inputs = [hero_msg, state_history, state_uploaded, state_awaiting]
    hero_outputs = [chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg]
    for hero_trigger in (hero_send.click, hero_msg.submit):
        hero_trigger(
            _hero_start,
            inputs=hero_inputs,
            outputs=hero_outputs,
            concurrency_limit=2, queue=True,
        )

    # ---- Normal chat interactions after hero is gone
    chat_inputs = [msg, state_history, state_uploaded, state_awaiting]
    chat_outputs = [chat, msg, state_history, state_awaiting]
    for chat_trigger in (send.click, msg.submit):
        chat_trigger(
            _on_send,
            inputs=chat_inputs,
            outputs=chat_outputs,
            concurrency_limit=2, queue=True,
        )

    def _on_clear():
        """Reset the conversation and return to the hero screen.

        ``state_uploaded`` and the uploads widget are not among the outputs,
        so previously uploaded files remain attached after a clear.
        """
        return (
            [], "", [], False,
            gr.update(visible=True),    # hero_wrap
            gr.update(visible=False),   # app_wrap
            "",                         # hero_msg
        )

    clear.click(_on_clear, None, [chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg])
if __name__ == "__main__":
    # Listening port is read from the PORT environment variable (default 7860).
    port = int(os.getenv("PORT", "7860"))
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": port,
        "show_api": False,
        "max_threads": 40,
    }
    demo.launch(**launch_options)