pythonprincess commited on
Commit
b8dfce3
·
verified ·
1 Parent(s): 46a515b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app/model_loader.py +897 -0
  2. app/orchestrator.py +1410 -0
app/model_loader.py ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/model_loader.py
2
+ """
3
+ 🧠 PENNY Model Loader - Azure-Ready Multi-Model Orchestration
4
+
5
+ This is Penny's brain loader. She manages multiple specialized models:
6
+ - Gemma 7B for conversational reasoning
7
+ - NLLB-200 for 27-language translation
8
+ - Sentiment analysis for resident wellbeing
9
+ - Bias detection for equitable service
10
+ - LayoutLM for civic document processing
11
+
12
+ MISSION: Load AI models efficiently in memory-constrained environments while
13
+ maintaining Penny's warm, civic-focused personality across all interactions.
14
+
15
+ FEATURES:
16
+ - Lazy loading (models only load when needed)
17
+ - 8-bit quantization for memory efficiency
18
+ - GPU/CPU auto-detection
19
+ - Model caching and reuse
20
+ - Graceful fallbacks for Azure ML deployment
21
+ - Memory monitoring and cleanup
22
+ """
23
+
24
+ import json
25
+ import os
26
+ import torch
27
+ from typing import Dict, Any, Callable, Optional, Union, List
28
+ from pathlib import Path
29
+ import logging
30
+ from dataclasses import dataclass
31
+ from enum import Enum
32
+ from datetime import datetime
33
+
34
# --- LOGGING SETUP (Must be before functions that use it) ---
# Module-level logger named after this module, per standard logging practice;
# handlers/levels are expected to be configured by the application entry point.
logger = logging.getLogger(__name__)
36
+
37
+ # ============================================================
38
+ # HUGGING FACE AUTHENTICATION
39
+ # ============================================================
40
+
41
def setup_huggingface_auth() -> bool:
    """
    🔐 Log in to the Hugging Face Hub when a token is available.

    Reads HF_TOKEN first, then READTOKEN (the name used by some
    Hugging Face Spaces configurations).

    Returns:
        True if authentication succeeded, False when no token is set,
        huggingface_hub is missing, or the login call fails.
    """
    token = os.getenv("HF_TOKEN") or os.getenv("READTOKEN")

    if not token:
        # Without credentials, gated/private models cannot be downloaded.
        logger.warning("⚠️ HF_TOKEN/READTOKEN not found in environment")
        logger.warning("   Some models may not be accessible")
        logger.warning("   Set HF_TOKEN or READTOKEN in your environment or Hugging Face Spaces secrets")
        return False

    try:
        from huggingface_hub import login
        login(token=token, add_to_git_credential=False)
    except ImportError:
        logger.warning("⚠️ huggingface_hub not installed, skipping authentication")
        return False
    except Exception as e:
        logger.error(f"❌ Failed to authenticate with Hugging Face: {e}")
        return False

    logger.info("✅ Authenticated with Hugging Face Hub")
    return True
68
+
69
# Attempt authentication at module load.
# Note: This runs when the module is imported, so HF_TOKEN must be in environment
# For Hugging Face Spaces: Set HF_TOKEN as a secret in Space settings
# For local dev: Add HF_TOKEN to .env file or export it
# NOTE(review): import-time network/auth side effects make this module slow to
# import and harder to test in isolation — consider moving into an explicit init.
_authentication_result = setup_huggingface_auth()
if _authentication_result:
    logger.info("🔐 Hugging Face authentication successful - gated models accessible")
else:
    logger.warning("⚠️ Hugging Face authentication failed - only public models will work")
78
+
79
# --- PATH CONFIGURATION (Environment-Aware) ---
# Support both local development and Azure ML deployment.
# AZUREML_MODEL_DIR is injected by Azure ML into deployed containers.
if os.getenv("AZUREML_MODEL_DIR"):
    # Azure ML deployment - models are in AZUREML_MODEL_DIR
    MODEL_ROOT = Path(os.getenv("AZUREML_MODEL_DIR"))
    CONFIG_PATH = MODEL_ROOT / "model_config.json"
    logger.info("☁️ Running in Azure ML environment")
else:
    # Local development - models are in project structure
    # (this file lives in app/, so project root is two levels up).
    PROJECT_ROOT = Path(__file__).parent.parent
    MODEL_ROOT = PROJECT_ROOT / "models"
    CONFIG_PATH = MODEL_ROOT / "model_config.json"
    logger.info("💻 Running in local development environment")

logger.info(f"📂 Model config path: {CONFIG_PATH}")
94
+
95
+ # ============================================================
96
+ # PENNY'S CIVIC IDENTITY & PERSONALITY
97
+ # ============================================================
98
+
99
# System prompt prepended to every text-generation request (see
# ModelClient.predict) unless the caller passes skip_system_prompt=True.
# Treat this text as part of the model contract: do not edit casually.
PENNY_SYSTEM_PROMPT = (
    "You are Penny, a smart, civic-focused AI assistant serving local communities. "
    "You help residents navigate city services, government programs, and community resources. "
    "You're warm, professional, accurate, and always stay within your civic mission.\n\n"

    "Your expertise includes:\n"
    "- Connecting people with local services (food banks, shelters, libraries)\n"
    "- Translating information into 27 languages\n"
    "- Explaining public programs and eligibility\n"
    "- Guiding residents through civic processes\n"
    "- Providing emergency resources when needed\n\n"

    "YOUR PERSONALITY:\n"
    "- Warm and approachable, like a helpful community center staff member\n"
    "- Clear and practical, avoiding jargon\n"
    "- Culturally sensitive and inclusive\n"
    "- Patient with repetition or clarification\n"
    "- Funny when appropriate, but never at anyone's expense\n\n"

    "CRITICAL RULES:\n"
    "- When residents greet you by name (e.g., 'Hi Penny'), respond warmly and personally\n"
    "- You are ALWAYS Penny - never ChatGPT, Assistant, Claude, or any other name\n"
    "- If you don't know something, say so clearly and help find the right resource\n"
    "- NEVER make up information about services, eligibility, or contacts\n"
    "- Stay within your civic mission - you don't provide legal, medical, or financial advice\n"
    "- For emergencies, immediately connect to appropriate services (911, crisis lines)\n\n"
)
126
+
127
# --- GLOBAL STATE ---
# Process-wide caches shared by all ModelClient instances.
# NOTE(review): not guarded by a lock — assumes single-threaded loading; confirm.
_MODEL_CACHE: Dict[str, Any] = {}   # name -> loaded pipeline, for memory-efficient reuse
_LOAD_TIMES: Dict[str, float] = {}  # name -> load duration (seconds), for performance tracking
130
+
131
+
132
+ # ============================================================
133
+ # DEVICE MANAGEMENT
134
+ # ============================================================
135
+
136
class DeviceType(str, Enum):
    """Supported compute devices.

    Subclasses str so members compare equal to plain device strings
    (e.g. DeviceType.CUDA == "cuda") and serialize cleanly.
    """
    CUDA = "cuda"  # NVIDIA GPU
    CPU = "cpu"    # Fallback
    MPS = "mps"    # Apple Silicon
141
+
142
+
143
def get_optimal_device() -> str:
    """
    🎮 Pick the best available compute device for model inference.

    Preference order:
    1. CUDA GPU (NVIDIA)
    2. MPS (Apple Silicon)
    3. CPU (fallback)

    Returns:
        Device string ("cuda", "mps", or "cpu")
    """
    # Guard-clause style: return as soon as a device tier is confirmed.
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"🎮 GPU detected: {gpu_name} ({gpu_memory:.1f}GB)")
        return DeviceType.CUDA.value

    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        logger.info("🍎 Apple Silicon (MPS) detected")
        return DeviceType.MPS.value

    logger.info("💻 Using CPU for inference")
    logger.warning("⚠️ GPU not available - inference will be slower")
    return DeviceType.CPU.value
172
+
173
+
174
def get_memory_stats() -> Dict[str, float]:
    """
    📊 Snapshot current GPU and CPU memory usage.

    Returns:
        Dict with memory figures in GB (plus cpu_percent). GPU keys appear
        only when CUDA is available; CPU keys only when psutil is installed.
    """
    snapshot: Dict[str, float] = {}

    if torch.cuda.is_available():
        snapshot["gpu_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
        snapshot["gpu_reserved_gb"] = torch.cuda.memory_reserved() / 1e9
        snapshot["gpu_total_gb"] = torch.cuda.get_device_properties(0).total_memory / 1e9

    # CPU stats are best-effort: psutil is an optional dependency.
    try:
        import psutil
    except ImportError:
        return snapshot

    mem = psutil.virtual_memory()
    snapshot["cpu_used_gb"] = mem.used / 1e9
    snapshot["cpu_total_gb"] = mem.total / 1e9
    snapshot["cpu_percent"] = mem.percent
    return snapshot
199
+
200
+
201
+ # ============================================================
202
+ # MODEL CLIENT (Individual Model Handler)
203
+ # ============================================================
204
+
205
@dataclass
class ModelMetadata:
    """
    📋 Bookkeeping record for a single model.

    Captures identity (name/task/model_name/device), load timing, and
    cumulative inference statistics for monitoring endpoints.
    """
    name: str
    task: str
    model_name: str
    device: str
    loaded_at: Optional[datetime] = None
    load_time_seconds: Optional[float] = None
    memory_usage_gb: Optional[float] = None
    inference_count: int = 0
    total_inference_time_ms: float = 0.0

    @property
    def avg_inference_time_ms(self) -> float:
        """Mean inference latency in ms; 0.0 before any inference has run."""
        count = self.inference_count
        return self.total_inference_time_ms / count if count else 0.0
227
+
228
+
229
class ModelClient:
    """
    🤖 Manages a single HuggingFace model with optimized loading and inference.

    Features:
    - Lazy loading (load on first use)
    - Memory optimization (8-bit quantization)
    - Performance tracking
    - Graceful error handling
    - Automatic device placement
    """

    def __init__(
        self,
        name: str,
        model_name: str,
        task: str,
        device: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize model client (doesn't load the model yet).

        Args:
            name: Model identifier (e.g., "penny-core-agent")
            model_name: HuggingFace model ID
            task: Task type (text-generation, translation, etc.)
            device: Target device (auto-detected if None)
            config: Additional model configuration
        """
        self.name = name
        self.model_name = model_name
        self.task = task
        self.device = device or get_optimal_device()
        self.config = config or {}
        self.pipeline = None          # set by load_pipeline(); None until then
        self._load_attempted = False  # prevents repeated failing load attempts
        self.metadata = ModelMetadata(
            name=name,
            task=task,
            model_name=model_name,
            device=self.device
        )

        logger.info(f"📦 Initialized ModelClient: {name}")
        logger.debug(f"   Model: {model_name}")
        logger.debug(f"   Task: {task}")
        logger.debug(f"   Device: {self.device}")

    def load_pipeline(self) -> bool:
        """
        🔄 Loads the HuggingFace pipeline with Azure-optimized settings.

        Features:
        - 8-bit quantization for large models (saves ~50% memory)
        - Automatic device placement
        - Memory monitoring
        - Cache checking

        Returns:
            True if successful, False otherwise
        """
        if self.pipeline is not None:
            logger.debug(f"✅ {self.name} already loaded")
            return True

        # A previous failure marks this client dead; callers must create a
        # new client (or clear _load_attempted) to retry.
        if self._load_attempted:
            logger.warning(f"⚠️ Previous load attempt failed for {self.name}")
            return False

        global _MODEL_CACHE, _LOAD_TIMES

        # Check cache first — another client with the same name may have
        # already loaded this pipeline.
        if self.name in _MODEL_CACHE:
            logger.info(f"♻️ Using cached pipeline for {self.name}")
            self.pipeline = _MODEL_CACHE[self.name]
            return True

        logger.info(f"🔄 Loading {self.name} from HuggingFace...")
        self._load_attempted = True

        start_time = datetime.now()

        try:
            # Import pipeline from transformers (lazy import to avoid dependency issues)
            from transformers import pipeline

            # === TEXT GENERATION (Gemma 7B, GPT-2, etc.) ===
            if self.task == "text-generation":
                logger.info("   Using 8-bit quantization for memory efficiency...")

                # 8-bit only makes sense on CUDA (bitsandbytes requirement).
                use_8bit = self.device == DeviceType.CUDA.value

                if use_8bit:
                    # NOTE(review): load_in_8bit= is deprecated in recent
                    # transformers in favor of BitsAndBytesConfig — confirm
                    # the pinned transformers version supports this kwarg.
                    self.pipeline = pipeline(
                        "text-generation",
                        model=self.model_name,
                        tokenizer=self.model_name,
                        device_map="auto",
                        load_in_8bit=True,  # Reduces ~14GB to ~7GB
                        trust_remote_code=True,
                        torch_dtype=torch.float16
                    )
                else:
                    # CPU fallback — full float32 weights, no quantization.
                    self.pipeline = pipeline(
                        "text-generation",
                        model=self.model_name,
                        tokenizer=self.model_name,
                        device=-1,  # CPU
                        trust_remote_code=True,
                        torch_dtype=torch.float32
                    )

            # === TRANSLATION (NLLB-200, M2M-100, etc.) ===
            elif self.task == "translation":
                self.pipeline = pipeline(
                    "translation",
                    model=self.model_name,
                    device=0 if self.device == DeviceType.CUDA.value else -1,
                    # Defaults can be overridden per-call in predict().
                    src_lang=self.config.get("default_src_lang", "eng_Latn"),
                    tgt_lang=self.config.get("default_tgt_lang", "spa_Latn")
                )

            # === SENTIMENT ANALYSIS ===
            elif self.task == "sentiment-analysis":
                self.pipeline = pipeline(
                    "sentiment-analysis",
                    model=self.model_name,
                    device=0 if self.device == DeviceType.CUDA.value else -1,
                    truncation=True,
                    max_length=512
                )

            # === BIAS DETECTION (Zero-Shot Classification) ===
            elif self.task == "bias-detection":
                self.pipeline = pipeline(
                    "zero-shot-classification",
                    model=self.model_name,
                    device=0 if self.device == DeviceType.CUDA.value else -1
                )

            # === TEXT CLASSIFICATION (Generic) ===
            elif self.task == "text-classification":
                self.pipeline = pipeline(
                    "text-classification",
                    model=self.model_name,
                    device=0 if self.device == DeviceType.CUDA.value else -1,
                    truncation=True
                )

            # === PDF/DOCUMENT EXTRACTION (LayoutLMv3) ===
            elif self.task == "pdf-extraction":
                logger.warning("⚠️ PDF extraction requires additional OCR setup")
                logger.info("   Consider using Azure Form Recognizer as alternative")
                # Placeholder - requires pytesseract/OCR infrastructure.
                # NOTE(review): _load_attempted stays True, so this task can
                # never be retried on this client instance — confirm intended.
                self.pipeline = None
                return False

            else:
                # Unknown task: raised here, caught by the except below,
                # so the caller still just sees False.
                raise ValueError(f"Unknown task type: {self.task}")

            # === SUCCESS HANDLING ===
            if self.pipeline is not None:
                # Calculate load time
                load_time = (datetime.now() - start_time).total_seconds()
                self.metadata.loaded_at = datetime.now()
                self.metadata.load_time_seconds = load_time

                # Cache the pipeline for reuse by same-named clients
                _MODEL_CACHE[self.name] = self.pipeline
                _LOAD_TIMES[self.name] = load_time

                # Log memory usage (0 when no GPU present)
                mem_stats = get_memory_stats()
                self.metadata.memory_usage_gb = mem_stats.get("gpu_allocated_gb", 0)

                logger.info(f"✅ {self.name} loaded successfully!")
                logger.info(f"   Load time: {load_time:.2f}s")

                if "gpu_allocated_gb" in mem_stats:
                    logger.info(
                        f"   GPU Memory: {mem_stats['gpu_allocated_gb']:.2f}GB / "
                        f"{mem_stats['gpu_total_gb']:.2f}GB"
                    )

            return True

        except Exception as e:
            logger.error(f"❌ Failed to load {self.name}: {e}", exc_info=True)
            self.pipeline = None
            return False

    def predict(
        self,
        input_data: Union[str, Dict[str, Any]],
        **kwargs
    ) -> Dict[str, Any]:
        """
        🎯 Runs inference with the loaded model pipeline.

        Features:
        - Automatic pipeline loading
        - Error handling with fallback responses
        - Performance tracking
        - Penny's personality injection (for text-generation)

        Args:
            input_data: Text or structured input for the model
                (assumed to be a str for text-generation — TODO confirm)
            **kwargs: Task-specific parameters

        Returns:
            Model output dict with results or error information
        """
        # Track inference start time
        start_time = datetime.now()

        # Ensure pipeline is loaded (lazy loading on first call)
        if self.pipeline is None:
            success = self.load_pipeline()
            if not success:
                # NOTE(review): this error dict has no "success" key,
                # unlike the other failure paths below — confirm callers.
                return {
                    "error": f"{self.name} pipeline unavailable",
                    "detail": "Model failed to load. Check logs for details.",
                    "model": self.name
                }

        try:
            # === TEXT GENERATION ===
            if self.task == "text-generation":
                # Inject Penny's civic identity unless explicitly skipped
                if not kwargs.get("skip_system_prompt", False):
                    full_prompt = PENNY_SYSTEM_PROMPT + input_data
                else:
                    full_prompt = input_data

                # Extract generation parameters with safe defaults
                max_new_tokens = kwargs.get("max_new_tokens", 256)
                temperature = kwargs.get("temperature", 0.7)
                top_p = kwargs.get("top_p", 0.9)
                # Greedy decoding when temperature is 0 (sampling off)
                do_sample = kwargs.get("do_sample", temperature > 0.0)

                result = self.pipeline(
                    full_prompt,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=do_sample,
                    return_full_text=False,  # strip the prompt from the output
                    pad_token_id=self.pipeline.tokenizer.eos_token_id,
                    truncation=True
                )

                output = {
                    "generated_text": result[0]["generated_text"],
                    "model": self.name,
                    "success": True
                }

            # === TRANSLATION ===
            elif self.task == "translation":
                src_lang = kwargs.get("source_lang", "eng_Latn")
                tgt_lang = kwargs.get("target_lang", "spa_Latn")

                result = self.pipeline(
                    input_data,
                    src_lang=src_lang,
                    tgt_lang=tgt_lang,
                    max_length=512
                )

                output = {
                    "translation": result[0]["translation_text"],
                    "source_lang": src_lang,
                    "target_lang": tgt_lang,
                    "model": self.name,
                    "success": True
                }

            # === SENTIMENT ANALYSIS ===
            elif self.task == "sentiment-analysis":
                result = self.pipeline(input_data)

                output = {
                    "sentiment": result[0]["label"],
                    "confidence": result[0]["score"],
                    "model": self.name,
                    "success": True
                }

            # === BIAS DETECTION ===
            elif self.task == "bias-detection":
                candidate_labels = kwargs.get("candidate_labels", [
                    "neutral and objective",
                    "contains political bias",
                    "uses emotional language",
                    "culturally insensitive"
                ])

                result = self.pipeline(
                    input_data,
                    candidate_labels=candidate_labels,
                    multi_label=True  # labels scored independently
                )

                output = {
                    "labels": result["labels"],
                    "scores": result["scores"],
                    "model": self.name,
                    "success": True
                }

            # === TEXT CLASSIFICATION ===
            elif self.task == "text-classification":
                result = self.pipeline(input_data)

                output = {
                    "label": result[0]["label"],
                    "confidence": result[0]["score"],
                    "model": self.name,
                    "success": True
                }

            else:
                output = {
                    "error": f"Task '{self.task}' not implemented",
                    "model": self.name,
                    "success": False
                }

            # Track performance (counted even for not-implemented tasks)
            inference_time = (datetime.now() - start_time).total_seconds() * 1000
            self.metadata.inference_count += 1
            self.metadata.total_inference_time_ms += inference_time
            output["inference_time_ms"] = round(inference_time, 2)

            return output

        except Exception as e:
            logger.error(f"❌ Inference error in {self.name}: {e}", exc_info=True)
            return {
                "error": "Inference failed",
                "detail": str(e),
                "model": self.name,
                "success": False
            }

    def unload(self) -> None:
        """
        🗑️ Unloads the model to free memory.
        Critical for Azure environments with limited resources.
        No-op when the pipeline was never loaded.
        """
        if self.pipeline is not None:
            logger.info(f"🗑️ Unloading {self.name}...")

            # Delete pipeline (drops our reference so GC can collect it)
            del self.pipeline
            self.pipeline = None

            # Remove from cache too, or the global reference keeps it alive
            if self.name in _MODEL_CACHE:
                del _MODEL_CACHE[self.name]

            # Force GPU memory release
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            logger.info(f"✅ {self.name} unloaded successfully")

            # Log memory stats after unload
            mem_stats = get_memory_stats()
            if "gpu_allocated_gb" in mem_stats:
                logger.info(f"   GPU Memory: {mem_stats['gpu_allocated_gb']:.2f}GB remaining")

    def get_metadata(self) -> Dict[str, Any]:
        """
        📊 Returns model metadata and performance stats as a
        JSON-serializable dict (datetimes rendered via isoformat).
        """
        return {
            "name": self.metadata.name,
            "task": self.metadata.task,
            "model_name": self.metadata.model_name,
            "device": self.metadata.device,
            "loaded": self.pipeline is not None,
            "loaded_at": self.metadata.loaded_at.isoformat() if self.metadata.loaded_at else None,
            "load_time_seconds": self.metadata.load_time_seconds,
            "memory_usage_gb": self.metadata.memory_usage_gb,
            "inference_count": self.metadata.inference_count,
            "avg_inference_time_ms": round(self.metadata.avg_inference_time_ms, 2)
        }
620
+
621
+
622
+ # ============================================================
623
+ # MODEL LOADER (Singleton Manager)
624
+ # ============================================================
625
+
626
class ModelLoader:
    """
    🎛️ Singleton manager for all Penny's specialized models.

    Features:
    - Centralized model configuration
    - Lazy loading (models only load when needed)
    - Memory management
    - Health monitoring
    - Unified access interface

    NOTE(review): because __init__ is guarded by _models_loaded, a
    config_path passed to any construction after the first is silently
    ignored — confirm this is acceptable for callers.
    """

    # Shared singleton instance (created on first construction).
    _instance: Optional['ModelLoader'] = None

    def __new__(cls, *args, **kwargs):
        """Singleton pattern - only one ModelLoader instance."""
        if cls._instance is None:
            cls._instance = super(ModelLoader, cls).__new__(cls)
        return cls._instance

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize ModelLoader (only runs once due to singleton).

        Args:
            config_path: Path to model_config.json (optional; defaults
                to the module-level CONFIG_PATH)
        """
        # The guard attribute makes repeated __init__ calls (which Python
        # performs on every construction of a singleton) no-ops.
        if not hasattr(self, '_models_loaded'):
            self.models: Dict[str, ModelClient] = {}
            self._models_loaded = True
            self._initialization_time = datetime.now()

            # Use provided path or default
            config_file = Path(config_path) if config_path else CONFIG_PATH

            try:
                logger.info(f"📖 Loading model configuration from {config_file}")

                if not config_file.exists():
                    # Missing config is non-fatal: loader starts with zero models.
                    logger.warning(f"⚠️ Configuration file not found: {config_file}")
                    logger.info("   Create model_config.json with your model definitions")
                    return

                with open(config_file, "r") as f:
                    config = json.load(f)

                # Initialize ModelClients (doesn't load models yet — lazy)
                for model_id, model_info in config.items():
                    self.models[model_id] = ModelClient(
                        name=model_id,
                        model_name=model_info["model_name"],
                        task=model_info["task"],
                        config=model_info.get("config", {})
                    )

                logger.info(f"✅ ModelLoader initialized with {len(self.models)} models:")
                for model_id in self.models.keys():
                    logger.info(f"   - {model_id}")

            except json.JSONDecodeError as e:
                logger.error(f"❌ Invalid JSON in model_config.json: {e}")
            except Exception as e:
                logger.error(f"❌ Failed to initialize ModelLoader: {e}", exc_info=True)

    def get(self, model_id: str) -> Optional[ModelClient]:
        """
        🎯 Retrieves a configured ModelClient by ID.

        Args:
            model_id: Model identifier from config

        Returns:
            ModelClient instance or None if not found
        """
        return self.models.get(model_id)

    def list_models(self) -> List[str]:
        """📋 Returns list of all available model IDs."""
        return list(self.models.keys())

    def get_loaded_models(self) -> List[str]:
        """📋 Returns list of model IDs whose pipelines are currently in memory."""
        return [
            model_id
            for model_id, client in self.models.items()
            if client.pipeline is not None
        ]

    def unload_all(self) -> None:
        """
        🗑️ Unloads all models to free memory.
        Useful for Azure environments when switching workloads.
        """
        logger.info("🗑️ Unloading all models...")
        for model_client in self.models.values():
            model_client.unload()
        logger.info("✅ All models unloaded")

    def get_status(self) -> Dict[str, Any]:
        """
        📊 Returns comprehensive status of all models.
        Useful for health checks and monitoring endpoints.
        """
        status = {
            "initialization_time": self._initialization_time.isoformat(),
            "total_models": len(self.models),
            "loaded_models": len(self.get_loaded_models()),
            "device": get_optimal_device(),
            "memory": get_memory_stats(),
            "models": {}
        }

        # Per-model metadata keyed by model ID
        for model_id, client in self.models.items():
            status["models"][model_id] = client.get_metadata()

        return status
742
+
743
+
744
+ # ============================================================
745
+ # PUBLIC INTERFACE (Used by all *_utils.py modules)
746
+ # ============================================================
747
+
748
def load_model_pipeline(agent_name: str) -> Callable[..., Dict[str, Any]]:
    """
    🚀 Resolve a configured agent and return a callable inference function.

    This is the main entry point used by the *_utils.py modules
    (translation_utils.py, sentiment_utils.py, etc.) to reach Penny's models.

    Args:
        agent_name: Model ID from model_config.json

    Returns:
        Callable that forwards (input_data, **kwargs) to the client's predict()

    Raises:
        ValueError: If agent_name not found in configuration

    Example:
        >>> translator = load_model_pipeline("penny-translate-agent")
        >>> result = translator("Hello world", target_lang="spa_Latn")
    """
    registry = ModelLoader()
    client = registry.get(agent_name)

    if client is None:
        available = registry.list_models()
        raise ValueError(
            f"Agent ID '{agent_name}' not found in model configuration. "
            f"Available models: {available}"
        )

    # Trigger lazy loading up front so the first call doesn't pay for it;
    # predict() retries/handles failures itself.
    client.load_pipeline()

    return lambda input_data, **kwargs: client.predict(input_data, **kwargs)
786
+
787
+
788
+ # === CONVENIENCE FUNCTIONS ===
789
+
790
def get_model_status() -> Dict[str, Any]:
    """
    📊 Report the status of every configured model.
    Intended for health-check and monitoring endpoints.
    """
    return ModelLoader().get_status()
797
+
798
+
799
def preload_models(model_ids: Optional[List[str]] = None) -> None:
    """
    🚀 Eagerly load the given models during startup.

    Args:
        model_ids: List of model IDs to preload (None = all models)
    """
    registry = ModelLoader()
    targets = registry.list_models() if model_ids is None else model_ids

    logger.info(f"🚀 Preloading {len(targets)} models...")

    for model_id in targets:
        client = registry.get(model_id)
        if client:
            logger.info(f"   Loading {model_id}...")
            client.load_pipeline()

    logger.info("✅ Model preloading complete")
820
+
821
+
822
def initialize_model_system() -> bool:
    """
    🏁 Bootstrap the model system; call once during app startup.

    Instantiates the ModelLoader singleton and logs device and memory
    information. Models themselves are still loaded lazily.

    Returns:
        True if initialization successful
    """
    logger.info("🧠 Initializing Penny's model system...")

    try:
        loader = ModelLoader()  # creates/returns the singleton

        device = get_optimal_device()
        mem_stats = get_memory_stats()

        logger.info(f"✅ Model system initialized")
        logger.info(f"🎮 Compute device: {device}")

        if "gpu_total_gb" in mem_stats:
            logger.info(
                f"💾 GPU Memory: {mem_stats['gpu_total_gb']:.1f}GB total"
            )

        logger.info(f"📦 {len(loader.models)} models configured")

        # Preloading critical models at startup is opt-in:
        # preload_models(["penny-core-agent"])

        return True

    except Exception as e:
        logger.error(f"❌ Failed to initialize model system: {e}", exc_info=True)
        return False
859
+
860
+
861
+ # ============================================================
862
+ # CLI TESTING & DEBUGGING
863
+ # ============================================================
864
+
865
if __name__ == "__main__":
    # NOTE(review): the string below is a no-op expression statement, not a
    # real docstring (module docstrings must be the first statement) — kept
    # as-is since removing it would be a code change.
    """
    🧪 Test script for model loading and inference.
    Run with: python -m app.model_loader
    """
    print("=" * 60)
    print("🧪 Testing Penny's Model System")
    print("=" * 60)

    # Initialize the singleton and show what's configured
    loader = ModelLoader()
    print(f"\n📋 Available models: {loader.list_models()}")

    # Get status (device, memory, per-model metadata)
    status = get_model_status()
    print(f"\n📊 System status:")
    print(json.dumps(status, indent=2, default=str))

    # Test model loading (if models configured) — exercises the first
    # configured model only; this downloads weights on first run.
    if loader.models:
        test_model_id = list(loader.models.keys())[0]
        print(f"\n🧪 Testing model: {test_model_id}")

        client = loader.get(test_model_id)
        if client:
            print(f"   Loading pipeline...")
            success = client.load_pipeline()

            if success:
                print(f"   ✅ Model loaded successfully!")
                print(f"   Metadata: {json.dumps(client.get_metadata(), indent=2, default=str)}")
            else:
                print(f"   ❌ Model loading failed")
app/orchestrator.py ADDED
@@ -0,0 +1,1410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 🎭 PENNY Orchestrator - Request Routing & Coordination Engine
3
+
4
+ This is Penny's decision-making brain. She analyzes each request, determines
5
+ the best way to help, and coordinates between her specialized AI models and
6
+ civic data tools.
7
+
8
+ MISSION: Route every resident request to the right resource while maintaining
9
+ Penny's warm, helpful personality and ensuring fast, accurate responses.
10
+
11
+ FEATURES:
12
+ - Enhanced intent classification with confidence scoring
13
+ - Compound intent handling (weather + events)
14
+ - Graceful fallbacks when services are unavailable
15
+ - Performance tracking for all operations
16
+ - Context-aware responses
17
+ - Emergency routing with immediate escalation
18
+
19
+ ENHANCEMENTS (Phase 1):
20
+ - ✅ Structured logging with performance tracking
21
+ - ✅ Safe imports with availability flags
22
+ - ✅ Result format checking helper
23
+ - ✅ Enhanced error handling patterns
24
+ - ✅ Service availability tracking
25
+ - ✅ Fixed function signature mismatches
26
+ - ✅ Integration with enhanced modules
27
+ """
28
+
29
+ import logging
30
+ import time
31
+ from typing import Dict, Any, Optional, List, Tuple
32
+ from datetime import datetime
33
+ from dataclasses import dataclass, field
34
+ from enum import Enum
35
+
36
+ # --- ENHANCED MODULE IMPORTS ---
37
+ from app.intents import classify_intent_detailed, IntentType, IntentMatch
38
+ from app.location_utils import (
39
+ extract_location_detailed,
40
+ LocationMatch,
41
+ LocationStatus,
42
+ get_city_coordinates
43
+ )
44
+ from app.logging_utils import (
45
+ log_interaction,
46
+ sanitize_for_logging,
47
+ LogLevel
48
+ )
49
+
50
+ # --- AGENT IMPORTS (with availability tracking) ---
51
+ try:
52
+ from app.weather_agent import (
53
+ get_weather_for_location,
54
+ recommend_outfit,
55
+ weather_to_event_recommendations,
56
+ format_weather_summary
57
+ )
58
+ WEATHER_AGENT_AVAILABLE = True
59
+ except ImportError as e:
60
+ logger = logging.getLogger(__name__)
61
+ logger.warning(f"Weather agent not available: {e}")
62
+ WEATHER_AGENT_AVAILABLE = False
63
+
64
+ try:
65
+ from app.event_weather import get_event_recommendations_with_weather
66
+ EVENT_WEATHER_AVAILABLE = True
67
+ except ImportError as e:
68
+ logger = logging.getLogger(__name__)
69
+ logger.warning(f"Event weather integration not available: {e}")
70
+ EVENT_WEATHER_AVAILABLE = False
71
+
72
+ try:
73
+ from app.tool_agent import handle_tool_request
74
+ TOOL_AGENT_AVAILABLE = True
75
+ except ImportError as e:
76
+ logger = logging.getLogger(__name__)
77
+ logger.warning(f"Tool agent not available: {e}")
78
+ TOOL_AGENT_AVAILABLE = False
79
+
80
+ # --- MODEL IMPORTS (with availability tracking) ---
81
+ try:
82
+ from models.translation.translation_utils import translate_text
83
+ TRANSLATION_AVAILABLE = True
84
+ except ImportError as e:
85
+ logger = logging.getLogger(__name__)
86
+ logger.warning(f"Translation service not available: {e}")
87
+ TRANSLATION_AVAILABLE = False
88
+
89
+ try:
90
+ from models.sentiment.sentiment_utils import get_sentiment_analysis
91
+ SENTIMENT_AVAILABLE = True
92
+ except ImportError as e:
93
+ logger = logging.getLogger(__name__)
94
+ logger.warning(f"Sentiment service not available: {e}")
95
+ SENTIMENT_AVAILABLE = False
96
+
97
+ try:
98
+ from models.bias.bias_utils import check_bias
99
+ BIAS_AVAILABLE = True
100
+ except ImportError as e:
101
+ logger = logging.getLogger(__name__)
102
+ logger.warning(f"Bias detection service not available: {e}")
103
+ BIAS_AVAILABLE = False
104
+
105
+ try:
106
+ from models.gemma.gemma_utils import generate_response
107
+ LLM_AVAILABLE = True
108
+ except ImportError as e:
109
+ logger = logging.getLogger(__name__)
110
+ logger.warning(f"LLM service not available: {e}")
111
+ LLM_AVAILABLE = False
112
+
113
# --- LOGGING SETUP ---
# Module-level logger; also assigned inside the guarded-import except blocks
# above so import failures can be logged before this line runs.
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
CORE_MODEL_ID = "penny-core-agent"  # identifier of the core conversational model
MAX_RESPONSE_TIME_MS = 5000  # 5 seconds - log if exceeded

# --- TRACKING COUNTERS ---
# Process-lifetime counters, incremented by run_orchestrator() and
# _handle_emergency() respectively (not thread-safe; best-effort metrics).
_orchestration_count = 0
_emergency_count = 0
123
+
124
+
125
+ # ============================================================
126
+ # COMPATIBILITY HELPER - Result Format Checking
127
+ # ============================================================
128
+
129
def _check_result_success(
    result: Dict[str, Any],
    expected_keys: List[str]
) -> Tuple[bool, Optional[str]]:
    """
    ✅ Decide whether a utility-function result dict represents success.

    Different utility modules return different result shapes; this helper
    normalizes them using three rules, checked in order:

    1. An explicit ``"success"`` key wins (its ``"error"`` value, if any,
       is passed through).
    2. A truthy ``"error"`` value means explicit failure.
    3. Any of the *expected_keys* being present means implicit success.

    Anything else is treated as failure with a generic message.

    Args:
        result: Dictionary returned from a utility function.
        expected_keys: Keys whose presence indicates successful data.

    Returns:
        ``(is_success, error_message)`` — *error_message* is None on success.

    Example:
        result = await translate_text(message, "en", "es")
        success, error = _check_result_success(result, ["translated_text"])
        if success:
            text = result.get("translated_text")
    """
    # Rule 1: explicit success flag (preferred contract).
    if "success" in result:
        return result["success"], result.get("error")

    # Rule 2: a truthy error value is an explicit failure.
    error_value = result.get("error")
    if error_value:
        return False, error_value

    # Rule 3: presence of any expected data key implies success.
    for key in expected_keys:
        if key in result:
            return True, None

    # Unknown shape — assume failure.
    return False, "Unexpected response format"
172
+
173
+
174
+ # ============================================================
175
+ # SERVICE AVAILABILITY CHECK
176
+ # ============================================================
177
+
178
def get_service_availability() -> Dict[str, bool]:
    """
    📊 Report which optional services loaded successfully.

    The availability flags are fixed at import time by the guarded
    imports at the top of this module. Used by health checks, debugging,
    and handlers deciding between a real call and a fallback reply.

    Returns:
        Mapping of service name -> True when the service can be called.
    """
    availability: Dict[str, bool] = {}
    availability["translation"] = TRANSLATION_AVAILABLE
    availability["sentiment"] = SENTIMENT_AVAILABLE
    availability["bias_detection"] = BIAS_AVAILABLE
    availability["llm"] = LLM_AVAILABLE
    availability["tool_agent"] = TOOL_AGENT_AVAILABLE
    availability["weather"] = WEATHER_AGENT_AVAILABLE
    availability["event_weather"] = EVENT_WEATHER_AVAILABLE
    return availability
197
+
198
+
199
+ # ============================================================
200
+ # ORCHESTRATION RESULT STRUCTURE
201
+ # ============================================================
202
+
203
@dataclass
class OrchestrationResult:
    """
    📦 Structured result produced by the orchestration pipeline.

    A single record describing what happened while handling one request:
    the detected intent, the user-facing reply, and diagnostic metadata
    (timing, confidence, which model answered, whether a fallback fired).
    """
    intent: str                              # detected intent name
    reply: str                               # user-facing response text
    success: bool                            # whether the request succeeded
    tenant_id: Optional[str] = None          # city/location identifier
    data: Optional[Dict[str, Any]] = None    # raw data from downstream services
    model_id: Optional[str] = None           # which model/service produced the reply
    error: Optional[str] = None              # error message when success is False
    response_time_ms: Optional[float] = None  # end-to-end latency
    confidence: Optional[float] = None       # intent-classification confidence
    fallback_used: bool = False              # True if fallback logic triggered

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this result into a plain dict for API responses."""
        return dict(
            intent=self.intent,
            reply=self.reply,
            success=self.success,
            tenant_id=self.tenant_id,
            data=self.data,
            model_id=self.model_id,
            error=self.error,
            response_time_ms=self.response_time_ms,
            confidence=self.confidence,
            fallback_used=self.fallback_used,
        )
236
+
237
+
238
+ # ============================================================
239
+ # MAIN ORCHESTRATOR FUNCTION (ENHANCED)
240
+ # ============================================================
241
+
242
async def run_orchestrator(
    message: str,
    context: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    🧠 Main decision-making brain of Penny.

    This function:
    1. Analyzes the user's message to determine intent
    2. Extracts location/city information
    3. Routes to the appropriate specialized service
    4. Handles errors gracefully with helpful fallbacks
    5. Tracks performance and logs the interaction

    Args:
        message: User's input text.
        context: Additional context (tenant_id, lat, lon, session_id, etc.).
            Defaults to an empty dict when None.

    Returns:
        Dictionary with response and metadata (see OrchestrationResult.to_dict).

    Example:
        result = await run_orchestrator(
            message="What's the weather in Atlanta?",
            context={"lat": 33.7490, "lon": -84.3880}
        )
    """
    global _orchestration_count
    _orchestration_count += 1

    start_time = time.time()

    # Initialize context if not provided (fix: annotation is now
    # Optional[...] to match the None default).
    if context is None:
        context = {}

    # Sanitize message for logging (PII protection)
    safe_message = sanitize_for_logging(message)
    logger.info(f"🎭 Orchestrator processing: '{safe_message[:50]}...'")

    try:
        # === STEP 1: CLASSIFY INTENT (Enhanced) ===
        intent_result = classify_intent_detailed(message)
        intent = intent_result.intent
        confidence = intent_result.confidence

        logger.info(
            f"Intent detected: {intent.value} "
            f"(confidence: {confidence:.2f})"
        )

        # === STEP 2: EXTRACT LOCATION ===
        tenant_id = context.get("tenant_id")
        lat = context.get("lat")
        lon = context.get("lon")

        # If tenant_id not provided, try to extract from message
        if not tenant_id or tenant_id == "unknown":
            location_result = extract_location_detailed(message)

            if location_result.status == LocationStatus.FOUND:
                tenant_id = location_result.tenant_id
                logger.info(f"Location extracted: {tenant_id}")

                # Get coordinates for this tenant if available; explicit
                # coordinates in context take precedence over city lookup.
                coords = get_city_coordinates(tenant_id)
                if coords and lat is None and lon is None:
                    lat, lon = coords["lat"], coords["lon"]
                    logger.info(f"Coordinates loaded: {lat}, {lon}")

            elif location_result.status == LocationStatus.USER_LOCATION_NEEDED:
                logger.info("User location services needed")
            else:
                logger.info(f"No location detected: {location_result.status}")

        # === STEP 3: HANDLE EMERGENCY INTENTS (CRITICAL) ===
        # Emergencies short-circuit the normal routing/logging path and
        # return immediately after attaching metadata.
        if intent == IntentType.EMERGENCY:
            result = await _handle_emergency(
                message=message,
                context=context,
                start_time=start_time
            )
            # Set confidence and metadata before returning
            result.confidence = confidence
            result.tenant_id = tenant_id
            response_time = (time.time() - start_time) * 1000
            result.response_time_ms = round(response_time, 2)
            return result.to_dict()

        # === STEP 4: ROUTE TO APPROPRIATE HANDLER ===

        # Translation
        if intent == IntentType.TRANSLATION:
            result = await _handle_translation(message, context)

        # Sentiment Analysis
        elif intent == IntentType.SENTIMENT_ANALYSIS:
            result = await _handle_sentiment(message, context)

        # Bias Detection
        elif intent == IntentType.BIAS_DETECTION:
            result = await _handle_bias(message, context)

        # Document Processing
        elif intent == IntentType.DOCUMENT_PROCESSING:
            result = await _handle_document(message, context)

        # Weather (includes compound weather+events handling)
        elif intent == IntentType.WEATHER:
            result = await _handle_weather(
                message=message,
                context=context,
                tenant_id=tenant_id,
                lat=lat,
                lon=lon,
                intent_result=intent_result
            )

        # Events
        elif intent == IntentType.EVENTS:
            result = await _handle_events(
                message=message,
                context=context,
                tenant_id=tenant_id,
                lat=lat,
                lon=lon,
                intent_result=intent_result
            )

        # Local Resources
        elif intent == IntentType.LOCAL_RESOURCES:
            result = await _handle_local_resources(
                message=message,
                context=context,
                tenant_id=tenant_id,
                lat=lat,
                lon=lon
            )

        # Greeting, Help, Unknown
        elif intent in [IntentType.GREETING, IntentType.HELP, IntentType.UNKNOWN]:
            result = await _handle_conversational(
                message=message,
                intent=intent,
                context=context
            )

        else:
            # Unhandled intent type (shouldn't happen, but safety net)
            result = await _handle_fallback(message, intent, context)

        # === STEP 5: ADD METADATA & LOG INTERACTION ===
        response_time = (time.time() - start_time) * 1000
        result.response_time_ms = round(response_time, 2)
        result.confidence = confidence
        result.tenant_id = tenant_id

        # Log the interaction with structured logging
        log_interaction(
            tenant_id=tenant_id or "unknown",
            interaction_type="orchestration",
            intent=intent.value,
            response_time_ms=response_time,
            success=result.success,
            metadata={
                "confidence": confidence,
                "fallback_used": result.fallback_used,
                "model_id": result.model_id,
                "orchestration_count": _orchestration_count
            }
        )

        # Log slow responses
        if response_time > MAX_RESPONSE_TIME_MS:
            logger.warning(
                f"⚠️ Slow response: {response_time:.0f}ms "
                f"(intent: {intent.value})"
            )

        logger.info(
            f"✅ Orchestration complete: {intent.value} "
            f"({response_time:.0f}ms)"
        )

        return result.to_dict()

    except Exception as e:
        # === CATASTROPHIC FAILURE HANDLER ===
        # Top-level boundary: broad catch is deliberate so the caller
        # always receives a well-formed error payload.
        response_time = (time.time() - start_time) * 1000
        logger.error(
            f"❌ Orchestrator error: {e} "
            f"(response_time: {response_time:.0f}ms)",
            exc_info=True
        )

        # Log failed interaction
        log_interaction(
            tenant_id=context.get("tenant_id", "unknown"),
            interaction_type="orchestration_error",
            intent="error",
            response_time_ms=response_time,
            success=False,
            metadata={
                "error": str(e),
                "error_type": type(e).__name__
            }
        )

        error_result = OrchestrationResult(
            intent="error",
            reply=(
                "I'm having trouble processing your request right now. "
                "Please try again in a moment, or let me know if you need "
                "immediate assistance! 💛"
            ),
            success=False,
            error=str(e),
            model_id="orchestrator",
            fallback_used=True,
            response_time_ms=round(response_time, 2)
        )

        return error_result.to_dict()
465
+
466
+
467
+ # ============================================================
468
+ # SPECIALIZED INTENT HANDLERS (ENHANCED)
469
+ # ============================================================
470
+
471
async def _handle_emergency(
    message: str,
    context: Dict[str, Any],
    start_time: float
) -> OrchestrationResult:
    """
    🚨 CRITICAL: Emergency intent handler.

    This function handles crisis situations with immediate routing
    to appropriate services. All emergency interactions are logged
    for compliance and safety tracking.

    IMPORTANT: This is a compliance-critical function. All emergency
    interactions must be logged and handled with priority.

    Args:
        message: User's input text that triggered the emergency intent.
        context: Request context; only "tenant_id" is read here.
        start_time: time.time() captured when orchestration began,
            used to compute response_time_ms.

    Returns:
        OrchestrationResult with crisis resources in the reply and
        success=True (providing resources counts as success).
    """
    global _emergency_count
    _emergency_count += 1

    # Sanitize message for logging (but keep full context for safety review)
    safe_message = sanitize_for_logging(message)
    logger.warning(f"🚨 EMERGENCY INTENT DETECTED (#{_emergency_count}): {safe_message[:100]}")

    # TODO: Integrate with safety_utils.py when enhanced
    # from app.safety_utils import route_emergency
    # result = await route_emergency(message, context)

    # For now, provide crisis resources
    reply = (
        "🚨 **If this is a life-threatening emergency, please call 911 immediately.**\n\n"
        "For crisis support:\n"
        "- **National Suicide Prevention Lifeline:** 988\n"
        "- **Crisis Text Line:** Text HOME to 741741\n"
        "- **National Domestic Violence Hotline:** 1-800-799-7233\n\n"
        "I'm here to help connect you with local resources. "
        "What kind of support do you need right now?"
    )

    # Log emergency interaction for compliance (CRITICAL)
    response_time = (time.time() - start_time) * 1000
    log_interaction(
        tenant_id=context.get("tenant_id", "emergency"),
        interaction_type="emergency",
        intent=IntentType.EMERGENCY.value,
        response_time_ms=response_time,
        success=True,
        metadata={
            "emergency_number": _emergency_count,
            "message_length": len(message),
            "timestamp": datetime.now().isoformat(),
            "action": "crisis_resources_provided"
        }
    )

    # CRITICAL severity so emergency events stand out in aggregated logs.
    logger.critical(
        f"EMERGENCY LOG #{_emergency_count}: Resources provided "
        f"({response_time:.0f}ms)"
    )

    return OrchestrationResult(
        intent=IntentType.EMERGENCY.value,
        reply=reply,
        success=True,
        model_id="emergency_router",
        data={"crisis_resources_provided": True},
        response_time_ms=round(response_time, 2)
    )
537
+
538
+
539
+ async def _handle_translation(
540
+ message: str,
541
+ context: Dict[str, Any]
542
+ ) -> OrchestrationResult:
543
+ """
544
+ 🌍 Translation handler - 27 languages supported.
545
+
546
+ Handles translation requests with graceful fallback if service
547
+ is unavailable.
548
+ """
549
+ logger.info("🌍 Processing translation request")
550
+
551
+ # Check service availability first
552
+ if not TRANSLATION_AVAILABLE:
553
+ logger.warning("Translation service not available")
554
+ return OrchestrationResult(
555
+ intent=IntentType.TRANSLATION.value,
556
+ reply="Translation isn't available right now. Try again soon! 🌍",
557
+ success=False,
558
+ error="Service not loaded",
559
+ fallback_used=True
560
+ )
561
+
562
+ try:
563
+ # Extract language parameters from context or parse from message
564
+ source_lang = context.get("source_lang", "eng_Latn")
565
+ target_lang = context.get("target_lang", "spa_Latn")
566
+
567
+ # Parse target language from message if present
568
+ # Examples: "translate to Spanish", "in Spanish", "to Spanish"
569
+ message_lower = message.lower()
570
+ language_keywords = {
571
+ "spanish": "spa_Latn", "español": "spa_Latn", "es": "spa_Latn",
572
+ "french": "fra_Latn", "français": "fra_Latn", "fr": "fra_Latn",
573
+ "chinese": "zho_Hans", "mandarin": "zho_Hans", "zh": "zho_Hans",
574
+ "arabic": "arb_Arab", "ar": "arb_Arab",
575
+ "hindi": "hin_Deva", "hi": "hin_Deva",
576
+ "portuguese": "por_Latn", "pt": "por_Latn",
577
+ "russian": "rus_Cyrl", "ru": "rus_Cyrl",
578
+ "german": "deu_Latn", "de": "deu_Latn",
579
+ "vietnamese": "vie_Latn", "vi": "vie_Latn",
580
+ "tagalog": "tgl_Latn", "tl": "tgl_Latn",
581
+ "urdu": "urd_Arab", "ur": "urd_Arab",
582
+ "swahili": "swh_Latn", "sw": "swh_Latn",
583
+ "english": "eng_Latn", "en": "eng_Latn"
584
+ }
585
+
586
+ # Check for "to [language]" or "in [language]" patterns
587
+ for lang_name, lang_code in language_keywords.items():
588
+ if f"to {lang_name}" in message_lower or f"in {lang_name}" in message_lower:
589
+ target_lang = lang_code
590
+ logger.info(f"🌍 Detected target language from message: {lang_name} -> {lang_code}")
591
+ break
592
+
593
+ result = await translate_text(message, source_lang, target_lang)
594
+
595
+ # Check if translation service was actually available
596
+ if not result.get("available", True):
597
+ error_msg = result.get("error", "Translation service is temporarily unavailable.")
598
+ logger.warning(f"Translation service unavailable: {error_msg}")
599
+ return OrchestrationResult(
600
+ intent=IntentType.TRANSLATION.value,
601
+ reply=(
602
+ "I'm having trouble accessing the translation service right now. "
603
+ "Please try again in a moment! 🌍"
604
+ ),
605
+ success=False,
606
+ error=error_msg,
607
+ fallback_used=True
608
+ )
609
+
610
+ # Use compatibility helper to check result
611
+ success, error = _check_result_success(result, ["translated_text"])
612
+
613
+ if success:
614
+ translated = result.get("translated_text", "")
615
+
616
+ # Check if translation was skipped (same source/target language)
617
+ if result.get("skipped", False):
618
+ reply = (
619
+ f"The text is already in {target_lang}. "
620
+ f"No translation needed! 🌍"
621
+ )
622
+ else:
623
+ reply = (
624
+ f"Here's the translation:\n\n"
625
+ f"**{translated}**\n\n"
626
+ f"(Translated from {source_lang} to {target_lang})"
627
+ )
628
+
629
+ return OrchestrationResult(
630
+ intent=IntentType.TRANSLATION.value,
631
+ reply=reply,
632
+ success=True,
633
+ data=result,
634
+ model_id="penny-translate-agent"
635
+ )
636
+ else:
637
+ raise Exception(error or "Translation failed")
638
+
639
+ except Exception as e:
640
+ logger.error(f"Translation error: {e}", exc_info=True)
641
+ return OrchestrationResult(
642
+ intent=IntentType.TRANSLATION.value,
643
+ reply=(
644
+ "I had trouble translating that. Could you rephrase? 💬"
645
+ ),
646
+ success=False,
647
+ error=str(e),
648
+ fallback_used=True
649
+ )
650
+
651
+
652
async def _handle_sentiment(
    message: str,
    context: Dict[str, Any]
) -> OrchestrationResult:
    """
    😊 Sentiment analysis handler.

    Runs the sentiment model over the user's text and reports the
    detected emotional tone, falling back gracefully when the service
    never loaded or the analysis call fails.
    """
    logger.info("😊 Processing sentiment analysis")

    # Guard clause: bail out early when the model never loaded.
    if not SENTIMENT_AVAILABLE:
        logger.warning("Sentiment service not available")
        return OrchestrationResult(
            intent=IntentType.SENTIMENT_ANALYSIS.value,
            reply="Sentiment analysis isn't available right now. Try again soon! 😊",
            success=False,
            error="Service not loaded",
            fallback_used=True
        )

    try:
        analysis = await get_sentiment_analysis(message)

        # Normalize the utility's result shape via the shared helper.
        ok, failure_reason = _check_result_success(analysis, ["label", "score"])
        if not ok:
            raise Exception(failure_reason or "Sentiment analysis failed")

        detected_label = analysis.get("label", "neutral")
        detected_score = analysis.get("score", 0.0)

        summary = (
            f"The overall sentiment detected is: **{detected_label}**\n"
            f"Confidence: {detected_score:.1%}"
        )

        return OrchestrationResult(
            intent=IntentType.SENTIMENT_ANALYSIS.value,
            reply=summary,
            success=True,
            data=analysis,
            model_id="penny-sentiment-agent"
        )

    except Exception as e:
        logger.error(f"Sentiment analysis error: {e}", exc_info=True)
        return OrchestrationResult(
            intent=IntentType.SENTIMENT_ANALYSIS.value,
            reply="I couldn't analyze the sentiment right now. Try again? 😊",
            success=False,
            error=str(e),
            fallback_used=True
        )
709
+
710
async def _handle_bias(
    message: str,
    context: Dict[str, Any]
) -> OrchestrationResult:
    """
    ⚖️ Bias detection handler.

    Analyzes text for potential bias patterns with graceful fallback
    if service is unavailable.

    Args:
        message: User's input text to analyze.
        context: Request context (unused here; kept for handler-signature
            consistency with the other _handle_* functions).

    Returns:
        OrchestrationResult with the top bias category and confidence,
        or a fallback reply on failure.
    """
    logger.info("⚖️ Processing bias detection")

    # Check service availability first
    if not BIAS_AVAILABLE:
        logger.warning("Bias detection service not available")
        return OrchestrationResult(
            intent=IntentType.BIAS_DETECTION.value,
            reply="Bias detection isn't available right now. Try again soon! ⚖️",
            success=False,
            error="Service not loaded",
            fallback_used=True
        )

    try:
        result = await check_bias(message)

        # Use compatibility helper to check result
        success, error = _check_result_success(result, ["analysis"])

        if success:
            # "analysis" is presumably a score-ordered list of
            # {"label", "score"} dicts — confirm against bias_utils.
            analysis = result.get("analysis", [])

            if analysis:
                # Report only the highest-ranked category.
                top_result = analysis[0]
                label = top_result.get("label", "unknown")
                score = top_result.get("score", 0.0)

                reply = (
                    f"Bias analysis complete:\n\n"
                    f"**Most likely category:** {label}\n"
                    f"**Confidence:** {score:.1%}"
                )
            else:
                # Empty analysis list: nothing flagged.
                reply = "The text appears relatively neutral. ⚖️"

            return OrchestrationResult(
                intent=IntentType.BIAS_DETECTION.value,
                reply=reply,
                success=True,
                data=result,
                model_id="penny-bias-checker"
            )
        else:
            raise Exception(error or "Bias detection failed")

    except Exception as e:
        logger.error(f"Bias detection error: {e}", exc_info=True)
        return OrchestrationResult(
            intent=IntentType.BIAS_DETECTION.value,
            reply="I couldn't check for bias right now. Try again? ⚖️",
            success=False,
            error=str(e),
            fallback_used=True
        )
774
+
775
+
776
async def _handle_document(
    message: str,
    context: Dict[str, Any]
) -> OrchestrationResult:
    """
    📄 Document processing handler.

    Only returns upload instructions — the actual file upload is handled
    in router.py by FastAPI via the `/upload-document` endpoint.
    """
    logger.info("📄 Document processing requested")

    instructions = (
        "I can help you process documents! 📄\n\n"
        "Please upload your document (PDF or image) using the "
        "`/upload-document` endpoint. I can extract text, analyze forms, "
        "and help you understand civic documents.\n\n"
        "What kind of document do you need help with?"
    )

    return OrchestrationResult(
        intent=IntentType.DOCUMENT_PROCESSING.value,
        reply=instructions,
        success=True,
        model_id="document_router"
    )
802
+
803
+
804
+ async def _handle_weather(
805
+ message: str,
806
+ context: Dict[str, Any],
807
+ tenant_id: Optional[str],
808
+ lat: Optional[float],
809
+ lon: Optional[float],
810
+ intent_result: IntentMatch
811
+ ) -> OrchestrationResult:
812
+ """
813
+ 🌤️ Weather handler with compound intent support.
814
+
815
+ Handles both simple weather queries and compound weather+events queries.
816
+ Uses enhanced weather_agent.py with caching and performance tracking.
817
+ """
818
+ logger.info("🌤️ Processing weather request")
819
+
820
+ # Check service availability first
821
+ if not WEATHER_AGENT_AVAILABLE:
822
+ logger.warning("Weather agent not available")
823
+ return OrchestrationResult(
824
+ intent=IntentType.WEATHER.value,
825
+ reply="Weather service isn't available right now. Try again soon! 🌤️",
826
+ success=False,
827
+ error="Weather agent not loaded",
828
+ fallback_used=True
829
+ )
830
+
831
+ # Check for compound intent (weather + events)
832
+ is_compound = intent_result.is_compound or IntentType.EVENTS in intent_result.secondary_intents
833
+
834
+ # === ENHANCED LOCATION RESOLUTION ===
835
+ # Try multiple strategies to get coordinates
836
+
837
+ # Strategy 1: Use provided coordinates
838
+ if lat is not None and lon is not None:
839
+ logger.info(f"Using provided coordinates: {lat}, {lon}")
840
+
841
+ # Strategy 2: Get coordinates from tenant_id (try multiple formats)
842
+ elif tenant_id:
843
+ # Try tenant_id as-is first
844
+ coords = get_city_coordinates(tenant_id)
845
+
846
+ # If that fails and tenant_id doesn't have state suffix, try adding common suffixes
847
+ if not coords and "_" not in tenant_id:
848
+ # Try common state abbreviations for known cities
849
+ state_suffixes = ["_va", "_ga", "_al", "_tx", "_ri", "_wa"]
850
+ for suffix in state_suffixes:
851
+ test_tenant_id = tenant_id + suffix
852
+ coords = get_city_coordinates(test_tenant_id)
853
+ if coords:
854
+ tenant_id = test_tenant_id # Update tenant_id to normalized form
855
+ logger.info(f"Normalized tenant_id to {tenant_id}")
856
+ break
857
+
858
+ if coords:
859
+ lat, lon = coords["lat"], coords["lon"]
860
+ logger.info(f"✅ Using city coordinates for {tenant_id}: {lat}, {lon}")
861
+
862
+ # Strategy 3: Extract location from message if still no coordinates
863
+ if lat is None or lon is None:
864
+ logger.info("No coordinates from tenant_id, trying to extract from message")
865
+ location_result = extract_location_detailed(message)
866
+
867
+ if location_result.status == LocationStatus.FOUND:
868
+ extracted_tenant_id = location_result.tenant_id
869
+ logger.info(f"📍 Location extracted from message: {extracted_tenant_id}")
870
+
871
+ # Update tenant_id if we extracted a better one
872
+ if not tenant_id or tenant_id != extracted_tenant_id:
873
+ tenant_id = extracted_tenant_id
874
+ logger.info(f"Updated tenant_id to {tenant_id}")
875
+
876
+ # Get coordinates for extracted location
877
+ coords = get_city_coordinates(tenant_id)
878
+ if coords:
879
+ lat, lon = coords["lat"], coords["lon"]
880
+ logger.info(f"✅ Coordinates found from message extraction: {lat}, {lon}")
881
+
882
+ # Final check: if still no coordinates, return error
883
+ if lat is None or lon is None:
884
+ logger.warning(f"❌ No coordinates available for weather request (tenant_id: {tenant_id})")
885
+ return OrchestrationResult(
886
+ intent=IntentType.WEATHER.value,
887
+ reply=(
888
+ "I need to know your location to check the weather! 📍 "
889
+ "You can tell me your city, or share your location."
890
+ ),
891
+ success=False,
892
+ error="Location required"
893
+ )
894
+
895
+ try:
896
+ # Use combined weather + events if compound intent detected
897
+ if is_compound and tenant_id and EVENT_WEATHER_AVAILABLE:
898
+ logger.info("Using weather+events combined handler")
899
+ result = await get_event_recommendations_with_weather(tenant_id, lat, lon)
900
+
901
+ # Build response
902
+ weather = result.get("weather", {})
903
+ weather_summary = result.get("weather_summary", "Weather unavailable")
904
+ suggestions = result.get("suggestions", [])
905
+
906
+ reply_lines = [f"🌤️ **Weather Update:**\n{weather_summary}\n"]
907
+
908
+ if suggestions:
909
+ reply_lines.append("\n📅 **Event Suggestions Based on Weather:**")
910
+ for suggestion in suggestions[:5]: # Top 5 suggestions
911
+ reply_lines.append(f"• {suggestion}")
912
+
913
+ reply = "\n".join(reply_lines)
914
+
915
+ return OrchestrationResult(
916
+ intent=IntentType.WEATHER.value,
917
+ reply=reply,
918
+ success=True,
919
+ data=result,
920
+ model_id="weather_events_combined"
921
+ )
922
+
923
+ else:
924
+ # Simple weather query using enhanced weather_agent
925
+ weather = await get_weather_for_location(lat, lon)
926
+
927
+ # Use enhanced weather_agent's format_weather_summary
928
+ if format_weather_summary:
929
+ weather_text = format_weather_summary(weather)
930
+ else:
931
+ # Fallback formatting
932
+ temp = weather.get("temperature", {}).get("value")
933
+ phrase = weather.get("phrase", "Conditions unavailable")
934
+ if temp:
935
+ weather_text = f"{phrase}, {int(temp)}°F"
936
+ else:
937
+ weather_text = phrase
938
+
939
+ # Get outfit recommendation from enhanced weather_agent
940
+ if recommend_outfit:
941
+ temp = weather.get("temperature", {}).get("value", 70)
942
+ condition = weather.get("phrase", "Clear")
943
+ outfit = recommend_outfit(temp, condition)
944
+ reply = f"🌤️ {weather_text}\n\n👕 {outfit}"
945
+ else:
946
+ reply = f"🌤️ {weather_text}"
947
+
948
+ return OrchestrationResult(
949
+ intent=IntentType.WEATHER.value,
950
+ reply=reply,
951
+ success=True,
952
+ data=weather,
953
+ model_id="azure-maps-weather"
954
+ )
955
+
956
+ except Exception as e:
957
+ logger.error(f"Weather error: {e}", exc_info=True)
958
+ return OrchestrationResult(
959
+ intent=IntentType.WEATHER.value,
960
+ reply=(
961
+ "I'm having trouble getting weather data right now. "
962
+ "Can I help you with something else? 💛"
963
+ ),
964
+ success=False,
965
+ error=str(e),
966
+ fallback_used=True
967
+ )
968
+
969
+
970
async def _handle_events(
    message: str,
    context: Dict[str, Any],
    tenant_id: Optional[str],
    lat: Optional[float],
    lon: Optional[float],
    intent_result: IntentMatch
) -> OrchestrationResult:
    """
    📅 Events handler.

    Routes event queries to tool_agent with proper error handling
    and graceful degradation.
    """
    logger.info("📅 Processing events request")

    # A city is mandatory: without a tenant we cannot scope the event lookup.
    if not tenant_id:
        return OrchestrationResult(
            intent=IntentType.EVENTS.value,
            reply=(
                "I'd love to help you find events! 📅 "
                "Which city are you interested in? "
                "I have information for Atlanta, Birmingham, Chesterfield, "
                "El Paso, Providence, and Seattle."
            ),
            success=False,
            error="City required"
        )

    # Degrade gracefully when the tool agent module failed to import.
    if not TOOL_AGENT_AVAILABLE:
        logger.warning("Tool agent not available")
        return OrchestrationResult(
            intent=IntentType.EVENTS.value,
            reply=(
                "Event information isn't available right now. "
                "Try again soon! 📅"
            ),
            success=False,
            error="Tool agent not loaded",
            fallback_used=True
        )

    try:
        # Role defaults to "resident" for tool-agent signature compatibility.
        payload = await handle_tool_request(
            user_input=message,
            role=context.get("role", "resident"),
            lat=lat,
            lon=lon,
            context=context
        )

        return OrchestrationResult(
            intent=IntentType.EVENTS.value,
            reply=payload.get("response", "Events information retrieved."),
            success=True,
            data=payload,
            model_id="events_tool"
        )

    except Exception as exc:
        logger.error(f"Events error: {exc}", exc_info=True)
        return OrchestrationResult(
            intent=IntentType.EVENTS.value,
            reply=(
                "I'm having trouble loading event information right now. "
                "Check back soon! 📅"
            ),
            success=False,
            error=str(exc),
            fallback_used=True
        )
1045
+
1046
async def _handle_local_resources(
    message: str,
    context: Dict[str, Any],
    tenant_id: Optional[str],
    lat: Optional[float],
    lon: Optional[float]
) -> OrchestrationResult:
    """
    🏛️ Local resources handler (shelters, libraries, food banks, etc.).

    Routes resource queries to tool_agent with proper error handling.
    """
    logger.info("🏛️ Processing local resources request")

    # Resource lookups are city-scoped; bail out early without a tenant.
    if not tenant_id:
        return OrchestrationResult(
            intent=IntentType.LOCAL_RESOURCES.value,
            reply=(
                "I can help you find local resources! 🏛️ "
                "Which city do you need help in? "
                "I cover Atlanta, Birmingham, Chesterfield, El Paso, "
                "Providence, and Seattle."
            ),
            success=False,
            error="City required"
        )

    # Degrade gracefully when the tool agent module failed to import.
    if not TOOL_AGENT_AVAILABLE:
        logger.warning("Tool agent not available")
        return OrchestrationResult(
            intent=IntentType.LOCAL_RESOURCES.value,
            reply=(
                "Resource information isn't available right now. "
                "Try again soon! 🏛️"
            ),
            success=False,
            error="Tool agent not loaded",
            fallback_used=True
        )

    try:
        # Role defaults to "resident" for tool-agent signature compatibility.
        payload = await handle_tool_request(
            user_input=message,
            role=context.get("role", "resident"),
            lat=lat,
            lon=lon,
            context=context
        )

        return OrchestrationResult(
            intent=IntentType.LOCAL_RESOURCES.value,
            reply=payload.get("response", "Resource information retrieved."),
            success=True,
            data=payload,
            model_id="resources_tool"
        )

    except Exception as exc:
        logger.error(f"Resources error: {exc}", exc_info=True)
        return OrchestrationResult(
            intent=IntentType.LOCAL_RESOURCES.value,
            reply=(
                "I'm having trouble finding resource information right now. "
                "Would you like to try a different search? 💛"
            ),
            success=False,
            error=str(exc),
            fallback_used=True
        )
1119
+
1120
+
1121
def _build_conversational_prompt(message: str, intent: IntentType) -> str:
    """Build the core-LLM prompt for a conversational intent (greeting/help/unknown)."""
    if intent == IntentType.GREETING:
        return (
            f"The user greeted you with: '{message}'\n\n"
            "Respond warmly as Penny, introduce yourself briefly, "
            "and ask how you can help them with civic services today."
        )

    if intent == IntentType.HELP:
        return (
            f"The user asked for help: '{message}'\n\n"
            "Explain Penny's main features:\n"
            "- Finding local resources (shelters, libraries, food banks)\n"
            "- Community events and activities\n"
            "- Weather information\n"
            "- 27-language translation\n"
            "- Document processing help\n\n"
            "Ask which city they need assistance in."
        )

    # UNKNOWN (or anything else routed here)
    return (
        f"The user said: '{message}'\n\n"
        "You're not sure what they need help with. "
        "Respond kindly, acknowledge their request, and ask them to "
        "clarify or rephrase. Mention a few things you can help with."
    )


async def _handle_conversational(
    message: str,
    intent: IntentType,
    context: Dict[str, Any]
) -> OrchestrationResult:
    """
    💬 Handles conversational intents (greeting, help, unknown).

    Uses Penny's core LLM for natural responses when available; otherwise
    (or on any LLM failure) falls through to hardcoded friendly replies.
    Expected conditions — LLM not loaded, or an unsuccessful generation
    result — no longer raise exceptions for control flow; only genuinely
    unexpected errors go through the except path.

    Args:
        message: Raw user message.
        intent: GREETING, HELP, or UNKNOWN.
        context: Request context (kept for handler-signature parity).

    Returns:
        OrchestrationResult with either an LLM-generated or fallback reply.
        Always succeeds from the caller's perspective (success=True).
    """
    logger.info(f"💬 Processing conversational intent: {intent.value}")

    if LLM_AVAILABLE:
        try:
            prompt = _build_conversational_prompt(message, intent)

            # Call Penny's core LLM
            llm_result = await generate_response(prompt=prompt, max_new_tokens=200)

            # Compatibility helper normalizes differing result schemas.
            success, error = _check_result_success(llm_result, ["response"])

            if success:
                return OrchestrationResult(
                    intent=intent.value,
                    reply=llm_result.get("response", ""),
                    success=True,
                    data=llm_result,
                    model_id=CORE_MODEL_ID
                )

            # Expected failure mode: log and fall through to canned replies.
            logger.warning(
                f"Conversational handler using fallback: {error or 'LLM generation failed'}"
            )

        except Exception as e:
            # Unexpected failure (network, model crash, ...): degrade gracefully.
            logger.warning(f"Conversational handler using fallback: {e}")
    else:
        logger.info("LLM not available, using fallback responses")

    # Hardcoded fallback responses (Penny's friendly voice)
    fallback_replies = {
        IntentType.GREETING: (
            "Hi there! 👋 I'm Penny, your civic assistant. "
            "I can help you find local resources, events, weather, and more. "
            "What city are you in?"
        ),
        IntentType.HELP: (
            "I'm Penny! 💛 I can help you with:\n\n"
            "🏛️ Local resources (shelters, libraries, food banks)\n"
            "📅 Community events\n"
            "🌤️ Weather updates\n"
            "🌍 Translation (27 languages)\n"
            "📄 Document help\n\n"
            "What would you like to know about?"
        ),
        IntentType.UNKNOWN: (
            "I'm not sure I understood that. Could you rephrase? "
            "I'm best at helping with local services, events, weather, "
            "and translation! 💬"
        )
    }

    return OrchestrationResult(
        intent=intent.value,
        reply=fallback_replies.get(intent, "How can I help you today? 💛"),
        success=True,
        model_id="fallback",
        fallback_used=True
    )
1222
+
1223
+
1224
async def _handle_fallback(
    message: str,
    intent: IntentType,
    context: Dict[str, Any]
) -> OrchestrationResult:
    """
    🆘 Ultimate fallback handler for unhandled intents.

    This is a safety net that should rarely trigger, but ensures
    users always get a helpful response.
    """
    logger.warning(f"⚠️ Fallback triggered for intent: {intent.value}")

    # Static apology + capability list; assembled from fragments so each
    # capability line stays easy to scan in source.
    reply_text = "".join([
        "I've processed your request, but I'm not sure how to help with that yet. ",
        "I'm still learning! 🤖\n\n",
        "I'm best at:\n",
        "🏛️ Finding local resources\n",
        "📅 Community events\n",
        "🌤️ Weather updates\n",
        "🌍 Translation\n\n",
        "Could you rephrase your question? 💛",
    ])

    return OrchestrationResult(
        intent=intent.value,
        reply=reply_text,
        success=False,
        error="Unhandled intent",
        fallback_used=True
    )
1255
+
1256
+
1257
+ # ============================================================
1258
+ # HEALTH CHECK & DIAGNOSTICS (ENHANCED)
1259
+ # ============================================================
1260
+
1261
def get_orchestrator_health() -> Dict[str, Any]:
    """
    📊 Returns comprehensive orchestrator health status.

    Used by the main application health check endpoint to monitor
    the orchestrator and all its service dependencies.

    Returns:
        Dictionary with health information including:
        - status: operational/degraded
        - service_availability: which services are loaded
        - statistics: orchestration counts
        - supported_intents: list of all intent types
        - features: available orchestrator features
    """
    availability = get_service_availability()

    # The orchestrator stays "operational" even with optional services down
    # (graceful degradation); only weather + tool_agent are must-haves.
    required = ("weather", "tool_agent")
    all_critical_up = all(availability.get(name, False) for name in required)
    status = "operational" if all_critical_up else "degraded"

    return {
        "status": status,
        "core_model": CORE_MODEL_ID,
        "max_response_time_ms": MAX_RESPONSE_TIME_MS,
        "statistics": {
            "total_orchestrations": _orchestration_count,
            "emergency_interactions": _emergency_count
        },
        "service_availability": availability,
        "supported_intents": [member.value for member in IntentType],
        "features": {
            "emergency_routing": True,
            "compound_intents": True,
            "fallback_handling": True,
            "performance_tracking": True,
            "context_aware": True,
            "multi_language": TRANSLATION_AVAILABLE,
            "sentiment_analysis": SENTIMENT_AVAILABLE,
            "bias_detection": BIAS_AVAILABLE,
            "weather_integration": WEATHER_AGENT_AVAILABLE,
            "event_recommendations": EVENT_WEATHER_AVAILABLE
        }
    }
1309
+
1310
+
1311
def get_orchestrator_stats() -> Dict[str, Any]:
    """
    📈 Returns orchestrator statistics.

    Useful for monitoring and analytics.

    Returns:
        Dictionary with total orchestration/emergency counters plus
        how many backing services are currently loaded out of the total.
    """
    # Hoisted: the original called get_service_availability() twice;
    # one call is sufficient and keeps the two derived values consistent.
    availability = get_service_availability()

    return {
        "total_orchestrations": _orchestration_count,
        "emergency_interactions": _emergency_count,
        "services_available": sum(1 for v in availability.values() if v),
        "services_total": len(availability)
    }
1323
+
1324
+
1325
+ # ============================================================
1326
+ # TESTING & DEBUGGING (ENHANCED)
1327
+ # ============================================================
1328
+
1329
if __name__ == "__main__":
    """
    🧪 Test the orchestrator with sample queries.
    Run with: python -m app.orchestrator
    """
    import asyncio

    banner = "=" * 60

    print(banner)
    print("🧪 Testing Penny's Orchestrator")
    print(banner)

    # Show which backing services loaded before exercising any queries.
    print("\n📊 Service Availability Check:")
    for name, loaded in get_service_availability().items():
        marker = "✅" if loaded else "❌"
        print(f" {marker} {name}: {'Available' if loaded else 'Not loaded'}")

    print("\n" + banner)

    cases = [
        {
            "name": "Greeting",
            "message": "Hi Penny!",
            "context": {}
        },
        {
            "name": "Weather with location",
            "message": "What's the weather?",
            "context": {"lat": 33.7490, "lon": -84.3880}
        },
        {
            "name": "Events in city",
            "message": "Events in Atlanta",
            "context": {"tenant_id": "atlanta_ga"}
        },
        {
            "name": "Help request",
            "message": "I need help",
            "context": {}
        },
        {
            "name": "Translation",
            "message": "Translate hello",
            "context": {"source_lang": "eng_Latn", "target_lang": "spa_Latn"}
        }
    ]

    async def run_tests():
        # Run each sample query through the orchestrator and summarize the result.
        for idx, case in enumerate(cases, 1):
            print(f"\n--- Test {idx}: {case['name']} ---")
            print(f"Query: {case['message']}")

            try:
                outcome = await run_orchestrator(case["message"], case["context"])
                print(f"Intent: {outcome['intent']}")
                print(f"Success: {outcome['success']}")
                print(f"Fallback: {outcome.get('fallback_used', False)}")

                # Truncate long replies for readable console output.
                answer = outcome['reply']
                if len(answer) > 150:
                    answer = answer[:150] + "..."
                print(f"Reply: {answer}")

                if outcome.get('response_time_ms'):
                    print(f"Response time: {outcome['response_time_ms']:.0f}ms")

            except Exception as exc:
                print(f"❌ Error: {exc}")

    asyncio.run(run_tests())

    print("\n" + banner)
    print("📊 Final Statistics:")
    for key, value in get_orchestrator_stats().items():
        print(f" {key}: {value}")

    print("\n" + banner)
    print("✅ Tests complete")
    print(banner)