Spaces:

MuhammadMahmoud
/

Aoun-Ai

Sleeping

App Files Files Community

MuhammadMahmoud committed on 14 days ago

Commit

c1d5b1b

1 Parent(s): 1c16632

fix dashboard issues

Browse files

Files changed (8) hide show

app/services/chat/api/gemini_client.py +17 -1
app/services/chat/api/groq_client.py +16 -0
app/services/chat/api/llm_error_classifier.py +107 -0
app/services/chat/api/model_validator.py +79 -25
app/services/chat/api/openai_client.py +16 -0
app/services/chat/api/openrouter_client.py +17 -0
app/static/dashboard.html +13 -4
docker-compose.yml +37 -2

app/services/chat/api/gemini_client.py CHANGED Viewed

@@ -66,21 +66,37 @@ class GeminiClient:
         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
-            if "quota" in error_msg or "exhausted" in error_msg:
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
                 return ErrorType.RATE_LIMITED, delay or 15
         if status_code == 404 or "not found" in error_msg:
             return ErrorType.MODEL_NOT_FOUND, 0
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
                 return ErrorType.MODEL_DECOMMISSIONED, 0
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
+        # ─── 5xx Server Errors (502, 503, etc) ───
+        if status_code and status_code >= 500 and status_code < 600:
+            logger.error(f"🚫 Gemini 5xx error (HTTP {status_code}): provider infrastructure issue")
+            if status_code in (502, 503):
+                return ErrorType.PERMANENT_FAILURE, 300  # Retry after 5 minutes
+            return ErrorType.PERMANENT_FAILURE, 0
+        # ─── Rate Limiting (429) ───
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
+            if "quota" in error_msg or "exhausted" in error_msg or "resource exhausted" in error_msg:
+                logger.warning(f"💰 Gemini quota exhausted (429): {error_msg[:80]}")
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
+                logger.warning(f"⏱️ Gemini rate limited (429): delaying {delay}s")
                 return ErrorType.RATE_LIMITED, delay or 15
+        # ─── Not Found (404) ───
         if status_code == 404 or "not found" in error_msg:
+            logger.error(f"❌ Gemini model not found (404)")
             return ErrorType.MODEL_NOT_FOUND, 0
+        # ─── Bad Request (400) ───
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
+                logger.error(f"❌ Gemini model decommissioned (400)")
                 return ErrorType.MODEL_DECOMMISSIONED, 0
+        # ─── Default: Unclassified ───
+        logger.warning(f"⚠️ Gemini unclassified error (HTTP {status_code}): {error_msg[:80]}")
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

app/services/chat/api/groq_client.py CHANGED Viewed

@@ -66,20 +66,36 @@ class GroqClient:
         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             delay = parse_retry_after(getattr(e, "message", str(e)))
             if "tpd" in error_msg or "tokens per day" in error_msg or "daily" in error_msg or "insufficient_quota" in error_msg:
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
                 return ErrorType.RATE_LIMITED, delay or 30
         if status_code == 404 or "not found" in error_msg:
             return ErrorType.MODEL_NOT_FOUND, 0
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
                 return ErrorType.MODEL_DECOMMISSIONED, 0
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
+        # ─── 5xx Server Errors (502, 503, etc) ───
+        if status_code and status_code >= 500 and status_code < 600:
+            logger.error(f"🚫 Groq 5xx error (HTTP {status_code}): provider infrastructure issue")
+            if status_code in (502, 503):
+                return ErrorType.PERMANENT_FAILURE, 300  # Retry after 5 minutes
+            return ErrorType.PERMANENT_FAILURE, 0
+        # ─── Rate Limiting (429) ───
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             delay = parse_retry_after(getattr(e, "message", str(e)))
             if "tpd" in error_msg or "tokens per day" in error_msg or "daily" in error_msg or "insufficient_quota" in error_msg:
+                logger.warning(f"💰 Groq quota exhausted (429): {error_msg[:80]}")
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
+                logger.warning(f"⏱️ Groq rate limited (429): delaying {delay}s")
                 return ErrorType.RATE_LIMITED, delay or 30
+        # ─── Not Found (404) ───
         if status_code == 404 or "not found" in error_msg:
+            logger.error(f"❌ Groq model not found (404)")
             return ErrorType.MODEL_NOT_FOUND, 0
+        # ─── Bad Request (400) ───
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
+                logger.error(f"❌ Groq model decommissioned (400)")
                 return ErrorType.MODEL_DECOMMISSIONED, 0
+        # ─── Default: Unclassified ───
+        logger.warning(f"⚠️ Groq unclassified error (HTTP {status_code}): {error_msg[:80]}")
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

app/services/chat/api/llm_error_classifier.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""
+Unified LLM error classification logic shared across all provider clients.
+Standardizes error handling and reduces code duplication.
+"""
+import logging
+from typing import Tuple
+from app.services.chat.api.llm_errors import ErrorType
+from app.services.chat.api.retry_parser import parse_retry_after
+logger = logging.getLogger(__name__)
+def classify_llm_error(e: Exception, provider_name: str = "Unknown") -> Tuple[ErrorType, int]:
+    """
+    Centralized error classification for all LLM providers.
+    Returns:
+        Tuple[ErrorType, int]: (error_type, retry_after_seconds)
+    """
+    error_msg = str(e).lower()
+    status_code = getattr(e, "status_code", None)
+    body = getattr(e, "body", None)
+    # ─── 5xx Server Errors → PERMANENT_FAILURE (provider is broken) ───
+    if status_code and status_code >= 500 and status_code < 600:
+        logger.error(f"[{provider_name}] 5xx error (HTTP {status_code}): likely infrastructure issue")
+        # Don't immediately retry 5xx — likely indicates provider issues
+        if status_code == 503:  # Service Unavailable
+            return ErrorType.PERMANENT_FAILURE, 300  # Retry after 5 minutes
+        else:
+            return ErrorType.PERMANENT_FAILURE, 0  # Don't auto-retry
+    # ─── Rate Limiting (429) ───
+    if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
+        delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
+        # Distinguish between quota exhaustion and rate limit
+        if ("insufficient_quota" in error_msg or
+            "quota" in error_msg or
+            "balance" in error_msg or
+            "tpd" in error_msg or  # Tokens per day (Groq)
+            "tokens per day" in error_msg or
+            "daily" in error_msg):
+            logger.warning(f"[{provider_name}] Quota exhausted (429): {error_msg[:100]}")
+            return ErrorType.QUOTA_EXHAUSTED, delay or 3600
+        else:
+            logger.warning(f"[{provider_name}] Rate limited (429): delaying {delay}s")
+            return ErrorType.RATE_LIMITED, delay or 30
+    # ─── Model Not Found (404) ───
+    if status_code == 404 or "not found" in error_msg or "model does not exist" in error_msg:
+        logger.error(f"[{provider_name}] Model not found (404)")
+        return ErrorType.MODEL_NOT_FOUND, 0
+    # ─── Bad Request (400) ───
+    if status_code == 400 or "bad request" in error_msg:
+        if "decommissioned" in error_msg or "offline" in error_msg or "unavailable" in error_msg:
+            logger.error(f"[{provider_name}] Model decommissioned/offline (400)")
+            return ErrorType.MODEL_DECOMMISSIONED, 0
+        else:
+            logger.error(f"[{provider_name}] Bad request (400): {error_msg[:100]}")
+            return ErrorType.PERMANENT_FAILURE, 0
+    # ─── Timeout (assumed permanent if not retryable) ───
+    if "timeout" in error_msg or "timed out" in error_msg:
+        logger.warning(f"[{provider_name}] Timeout error: {error_msg[:100]}")
+        return ErrorType.PERMANENT_FAILURE, 60  # Retry after 1 minute
+    # ─── Default: Unclassified error ───
+    logger.warning(f"[{provider_name}] Unclassified error: {error_msg[:100]}")
+    return ErrorType.PERMANENT_FAILURE, 0
+def should_disable_model_permanently(error_type: ErrorType) -> bool:
+    """Determine if a model should be permanently disabled based on error type."""
+    return error_type in (
+        ErrorType.MODEL_NOT_FOUND,
+        ErrorType.MODEL_DECOMMISSIONED,
+    )
+def should_retry_provider(error_type: ErrorType, attempt_count: int = 0) -> bool:
+    """
+    Determine if we should retry the current provider or fail over.
+    Args:
+        error_type: Type of error encountered
+        attempt_count: Number of attempts so far
+    Returns:
+        bool: True if should retry this provider, False if should failover
+    """
+    # Transient errors might recover with retry
+    if error_type == ErrorType.RATE_LIMITED and attempt_count < 2:
+        return True
+    # Quota exhaustion requires failover to another provider
+    if error_type == ErrorType.QUOTA_EXHAUSTED:
+        return False
+    # Permanent errors should not be retried on same provider
+    if error_type in (ErrorType.PERMANENT_FAILURE, ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED):
+        return False
+    return False

app/services/chat/api/model_validator.py CHANGED Viewed

@@ -3,9 +3,11 @@ Model Validator — Startup and periodic health probes for LLM models.
 Features:
   - Lightweight 1-token probes (max_tokens=1)
-  - Probe result caching (60s TTL) — healthy models are not re-probed
-  - Staggered probes (200ms between requests) to avoid burst quota hits
   - Separate fast path for periodic revalidation (only probes unhealthy models)
 Usage:
     Called automatically from FastAPI lifespan startup and from the
@@ -21,8 +23,10 @@ logger = logging.getLogger(__name__)
 # Probe result cache: { "Provider/Model": (timestamp, status) }
 _probe_cache: dict[str, tuple[float, str]] = {}
-_CACHE_TTL = 60  # seconds
-_PROBE_STAGGER = 0.2  # seconds between probes to avoid bursts
 def _is_cached_healthy(provider: str, model: str) -> bool:
@@ -44,68 +48,118 @@ def _cache_result(provider: str, model: str, status: str):
 async def validate_models_at_startup(revalidation_only: bool = False):
     """
-    Probes each model with a 1-token request.
     Disables dead models BEFORE serving traffic.
     Args:
         revalidation_only: If True, only probes models that are NOT cached healthy.
                            Used by periodic background revalidation to save quota.
     """
     from app.services.chat.api.llm_router import llm_router
-    results = {"alive": [], "dead": [], "error": []}
     probe_count = 0
-    for provider_name, client in llm_router.providers:
         if not getattr(client, "client", None):
             continue
         if hasattr(client, "refresh_dynamic_models"):
-            await client.refresh_dynamic_models()
         models = list(client._get_all_models())  # ALL models, not just active
-        for model in models:
             # Skip recently-verified healthy models during revalidation
             if revalidation_only and _is_cached_healthy(provider_name, model):
                 results["alive"].append(f"{provider_name}/{model}")
                 continue
-            # Stagger probes to avoid burst rate-limit hits
-            if probe_count > 0:
-                await asyncio.sleep(_PROBE_STAGGER)
             probe_count += 1
             try:
-                await client.client.chat.completions.create(
-                    model=model,
-                    messages=[{"role": "user", "content": "hi"}],
-                    max_tokens=1,
-                    timeout=10.0,
                 )
                 results["alive"].append(f"{provider_name}/{model}")
                 _cache_result(provider_name, model, "alive")
                 logger.info(f"✅ {provider_name}/{model} — alive")
             except Exception as e:
-                err_type, _ = client.classify_error(e)
                 if err_type in (ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED, ErrorType.PERMANENT_FAILURE):
                     client._permanently_disabled.add(model)
                     results["dead"].append(f"{provider_name}/{model}")
                     _cache_result(provider_name, model, "dead")
-                    logger.error(f"❌ {provider_name}/{model} — DEAD, disabled")
                 elif err_type == ErrorType.QUOTA_EXHAUSTED:
                     results["error"].append(f"{provider_name}/{model} (quota)")
                     # Don't cache quota errors — they're transient
-                    logger.warning(f"⚠️ {provider_name}/{model} — quota issue, keeping enabled")
                 else:
-                    results["error"].append(f"{provider_name}/{model} ({e})")
                     logger.warning(f"⚠️ {provider_name}/{model} — probe error: {e}")
     mode = "revalidation" if revalidation_only else "startup"
     logger.info(
-        f"Model validation complete ({mode}): "
         f"{len(results['alive'])} alive, "
         f"{len(results['dead'])} dead, "
         f"{len(results['error'])} warnings, "
         f"{probe_count} probes sent"
     )
     return results

 Features:
   - Lightweight 1-token probes (max_tokens=1)
+  - Aggressive result caching (300s TTL) — minimize redundant probes
+  - Intelligent staggering (500ms between providers, 200ms within provider)
   - Separate fast path for periodic revalidation (only probes unhealthy models)
+  - Per-provider sequential probing to avoid burst quota hits
+  - Automatic provider skip after N consecutive failures
 Usage:
     Called automatically from FastAPI lifespan startup and from the
 # Probe result cache: { "Provider/Model": (timestamp, status) }
 _probe_cache: dict[str, tuple[float, str]] = {}
+_CACHE_TTL = 300  # seconds (5 minutes - aggressive caching to avoid quota hits)
+_PROBE_STAGGER_WITHIN = 0.2  # seconds between models within same provider
+_PROBE_STAGGER_BETWEEN = 0.5  # seconds between providers
+_MAX_CONSECUTIVE_FAILURES = 5  # Skip provider if N models fail in a row
 def _is_cached_healthy(provider: str, model: str) -> bool:
 async def validate_models_at_startup(revalidation_only: bool = False):
     """
+    Intelligently probes each model with sequential per-provider strategy.
     Disables dead models BEFORE serving traffic.
+    Strategy:
+    1. Group models by provider
+    2. For each provider, probe models sequentially with within-provider stagger
+    3. Skip provider after N consecutive failures (likely rate-limited)
+    4. Aggressively cache results to avoid re-probing
     Args:
         revalidation_only: If True, only probes models that are NOT cached healthy.
                            Used by periodic background revalidation to save quota.
     """
     from app.services.chat.api.llm_router import llm_router
+    results = {"alive": [], "dead": [], "error": [], "skipped": []}
     probe_count = 0
+    for provider_idx, (provider_name, client) in enumerate(llm_router.providers):
         if not getattr(client, "client", None):
+            logger.warning(f"⏭️ Skipping {provider_name} — no client")
             continue
         if hasattr(client, "refresh_dynamic_models"):
+            try:
+                await client.refresh_dynamic_models()
+            except Exception as e:
+                logger.warning(f"Failed to refresh {provider_name} models: {e}")
         models = list(client._get_all_models())  # ALL models, not just active
+        if not models:
+            logger.warning(f"⏭️ Skipping {provider_name} — no models found")
+            continue
+        logger.info(f"🔍 Probing {provider_name} ({len(models)} models)...")
+        # Stagger between providers to avoid burst cross-provider
+        if provider_idx > 0:
+            await asyncio.sleep(_PROBE_STAGGER_BETWEEN)
+        consecutive_failures = 0
+        for model_idx, model in enumerate(models):
             # Skip recently-verified healthy models during revalidation
             if revalidation_only and _is_cached_healthy(provider_name, model):
                 results["alive"].append(f"{provider_name}/{model}")
+                consecutive_failures = 0
                 continue
+            # Skip provider if too many consecutive failures (likely quota crisis)
+            if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
+                reason = f"Provider {provider_name} — {_MAX_CONSECUTIVE_FAILURES} consecutive failures, skipping remaining models"
+                logger.error(f"🚫 {reason}")
+                remaining = len(models) - model_idx
+                for remaining_model in models[model_idx:]:
+                    results["skipped"].append(f"{provider_name}/{remaining_model}")
+                break
+            # Stagger within provider to avoid burst
+            if model_idx > 0:
+                await asyncio.sleep(_PROBE_STAGGER_WITHIN)
             probe_count += 1
             try:
+                await asyncio.wait_for(
+                    client.client.chat.completions.create(
+                        model=model,
+                        messages=[{"role": "user", "content": "hi"}],
+                        max_tokens=1,
+                        temperature=0.5,
+                    ),
+                    timeout=10.0
                 )
                 results["alive"].append(f"{provider_name}/{model}")
                 _cache_result(provider_name, model, "alive")
+                consecutive_failures = 0
                 logger.info(f"✅ {provider_name}/{model} — alive")
+            except asyncio.TimeoutError:
+                results["error"].append(f"{provider_name}/{model} (timeout)")
+                consecutive_failures += 1
+                logger.warning(f"⏱️ {provider_name}/{model} — timeout, probe error")
             except Exception as e:
+                err_type, retry_after = client.classify_error(e)
                 if err_type in (ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED, ErrorType.PERMANENT_FAILURE):
                     client._permanently_disabled.add(model)
                     results["dead"].append(f"{provider_name}/{model}")
                     _cache_result(provider_name, model, "dead")
+                    consecutive_failures += 1
+                    logger.error(f"❌ {provider_name}/{model} — DEAD, permanently disabled")
                 elif err_type == ErrorType.QUOTA_EXHAUSTED:
                     results["error"].append(f"{provider_name}/{model} (quota)")
+                    consecutive_failures += 1
                     # Don't cache quota errors — they're transient
+                    logger.warning(f"⚠️ {provider_name}/{model} — quota issue (may recover), keeping enabled")
                 else:
+                    results["error"].append(f"{provider_name}/{model} ({err_type.name})")
+                    consecutive_failures += 1
                     logger.warning(f"⚠️ {provider_name}/{model} — probe error: {e}")
     mode = "revalidation" if revalidation_only else "startup"
     logger.info(
+        f"✅ Model validation complete ({mode}): "
         f"{len(results['alive'])} alive, "
         f"{len(results['dead'])} dead, "
         f"{len(results['error'])} warnings, "
+        f"{len(results['skipped'])} skipped, "
         f"{probe_count} probes sent"
     )
     return results

app/services/chat/api/openai_client.py CHANGED Viewed

@@ -66,21 +66,37 @@ class OpenAIClient:
         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
             if "insufficient_quota" in error_msg or "quota" in error_msg or "exhausted" in error_msg:
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
                 return ErrorType.RATE_LIMITED, delay or 15
         if status_code == 404 or "not found" in error_msg:
             return ErrorType.MODEL_NOT_FOUND, 0
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
                 return ErrorType.MODEL_DECOMMISSIONED, 0
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
+        # ─── 5xx Server Errors (502, 503, etc) ───
+        if status_code and status_code >= 500 and status_code < 600:
+            logger.error(f"🚫 OpenAI 5xx error (HTTP {status_code}): provider infrastructure issue")
+            if status_code in (502, 503):
+                return ErrorType.PERMANENT_FAILURE, 300  # Retry after 5 minutes
+            return ErrorType.PERMANENT_FAILURE, 0
+        # ─── Rate Limiting (429) ───
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
             if "insufficient_quota" in error_msg or "quota" in error_msg or "exhausted" in error_msg:
+                logger.warning(f"💰 OpenAI quota exhausted (429): {error_msg[:80]}")
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
+                logger.warning(f"⏱️ OpenAI rate limited (429): delaying {delay}s")
                 return ErrorType.RATE_LIMITED, delay or 15
+        # ─── Not Found (404) ───
         if status_code == 404 or "not found" in error_msg:
+            logger.error(f"❌ OpenAI model not found (404)")
             return ErrorType.MODEL_NOT_FOUND, 0
+        # ─── Bad Request (400) ───
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
+                logger.error(f"❌ OpenAI model decommissioned (400)")
                 return ErrorType.MODEL_DECOMMISSIONED, 0
+        # ─── Default: Unclassified ───
+        logger.warning(f"⚠️ OpenAI unclassified error (HTTP {status_code}): {error_msg[:80]}")
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

app/services/chat/api/openrouter_client.py CHANGED Viewed

@@ -112,21 +112,38 @@ class OpenRouterClient:
         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
             if "insufficient_quota" in error_msg or "quota" in error_msg or "balance" in error_msg:
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
                 return ErrorType.RATE_LIMITED, delay or 15
         if status_code == 404 or "not found" in error_msg:
             return ErrorType.MODEL_NOT_FOUND, 0
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
                 return ErrorType.MODEL_DECOMMISSIONED, 0
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

         error_msg = str(e).lower()
         status_code = getattr(e, "status_code", None)
+        # ─── 5xx Server Errors (502, 503, etc) ───
+        if status_code and status_code >= 500 and status_code < 600:
+            logger.error(f"🚫 OpenRouter 5xx error (HTTP {status_code}): provider infrastructure issue")
+            # 502/503 = provider is down, likely transient
+            if status_code in (502, 503):
+                return ErrorType.PERMANENT_FAILURE, 300  # Retry after 5 minutes
+            return ErrorType.PERMANENT_FAILURE, 0
+        # ─── Rate Limiting (429) ───
         if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
             body = getattr(e, "body", None)
             delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
             if "insufficient_quota" in error_msg or "quota" in error_msg or "balance" in error_msg:
+                logger.warning(f"💰 OpenRouter quota exhausted (429): {error_msg[:80]}")
                 return ErrorType.QUOTA_EXHAUSTED, delay or 3600
             else:
+                logger.warning(f"⏱️ OpenRouter rate limited (429): delaying {delay}s")
                 return ErrorType.RATE_LIMITED, delay or 15
+        # ─── Not Found (404) ───
         if status_code == 404 or "not found" in error_msg:
+            logger.error(f"❌ OpenRouter model not found (404)")
             return ErrorType.MODEL_NOT_FOUND, 0
+        # ─── Bad Request (400) ───
         if status_code == 400 or "bad request" in error_msg:
             if "decommissioned" in error_msg or "offline" in error_msg:
+                logger.error(f"❌ OpenRouter model decommissioned (400)")
                 return ErrorType.MODEL_DECOMMISSIONED, 0
+        # ─── Default: Unclassified ───
+        logger.warning(f"⚠️ OpenRouter unclassified error (HTTP {status_code}): {error_msg[:80]}")
         return ErrorType.PERMANENT_FAILURE, 0
     async def call_api(

app/static/dashboard.html CHANGED Viewed

@@ -413,7 +413,11 @@ function renderSys(sys,redis,cost){
 function renderProviders(providers,breakers){
   const tb=document.querySelector('#provTable tbody'); if(!tb)return; tb.innerHTML='';
   let sc=0,sr=0,act=0,cnt=0;
-  if(!providers) return;
   for(const[name,p]of Object.entries(providers)){
     cnt++;sc+=(p.score||0);sr+=(p.success_rate_window||0);
     const cb=(breakers&&breakers[name])||{};
@@ -455,13 +459,18 @@ function renderProviders(providers,breakers){
 // Render models
 function renderModels(models){
-  const tb=document.querySelector('#modTable tbody');tb.innerHTML='';
-  Object.entries(models||{}).sort((a,b)=>b[1].total_calls-a[1].total_calls).forEach(([name,m])=>{
     const ban=BANNED.includes(name),s=m.success_rate;
     const sc=s>=95?'t-ok':s>=70?'t-warn':'t-crit';
     tb.innerHTML+=`<tr style="opacity:${ban?.6:1}">
       <td><span class="tag" style="${ban?'text-decoration:line-through;color:var(--tm)':''}">${name}</span></td>
-      <td class="mono"><span class="t-ok">✓${m.total_calls-m.total_errors}</span> <span class="t-mute">|</span> <span class="t-crit">✗${m.total_errors}</span></td>
       <td class="mono ${sc}" style="font-weight:700">${s.toFixed(1)}%</td>
       <td class="mono">${m.avg_latency_ms}ms</td>
       <td>${ban?`<button class="btn btn-ok" onclick="adminCmd('POST','/api/ai/admin/model/unban',{model_name:'${name}'})">🔓 Unban</button>`:`<button class="btn btn-crit" onclick="adminCmd('POST','/api/ai/admin/model/ban',{model_name:'${name}'})">⛔ Ban</button>`}</td>

 function renderProviders(providers,breakers){
   const tb=document.querySelector('#provTable tbody'); if(!tb)return; tb.innerHTML='';
   let sc=0,sr=0,act=0,cnt=0;
+  if(!providers || Object.keys(providers).length === 0) {
+    document.getElementById('kProv').textContent = '0/0';
+    tb.innerHTML = '<tr><td colspan="6" style="text-align:center;color:var(--tm);padding:24px;font-style:italic">No active providers found in registry. Router is starting... 🔄</td></tr>';
+    return;
+  }
   for(const[name,p]of Object.entries(providers)){
     cnt++;sc+=(p.score||0);sr+=(p.success_rate_window||0);
     const cb=(breakers&&breakers[name])||{};
 // Render models
 function renderModels(models){
+  const tb=document.querySelector('#modTable tbody'); if(!tb) return; tb.innerHTML='';
+  const entries = Object.entries(models||{});
+  if(entries.length === 0){
+    tb.innerHTML = '<tr><td colspan="5" style="text-align:center;color:var(--tm);padding:24px;font-style:italic">No model usage recorded in memory yet. Awaiting chat traffic... ⏳</td></tr>';
+    return;
+  }
+  entries.sort((a,b)=>b[1].total_calls-a[1].total_calls).forEach(([name,m])=>{
     const ban=BANNED.includes(name),s=m.success_rate;
     const sc=s>=95?'t-ok':s>=70?'t-warn':'t-crit';
     tb.innerHTML+=`<tr style="opacity:${ban?.6:1}">
       <td><span class="tag" style="${ban?'text-decoration:line-through;color:var(--tm)':''}">${name}</span></td>
+      <td class="mono"><span class="t-ok">✓${Math.max(0, m.total_calls-m.total_errors)}</span> <span class="t-mute">|</span> <span class="t-crit">✗${m.total_errors}</span></td>
       <td class="mono ${sc}" style="font-weight:700">${s.toFixed(1)}%</td>
       <td class="mono">${m.avg_latency_ms}ms</td>
       <td>${ban?`<button class="btn btn-ok" onclick="adminCmd('POST','/api/ai/admin/model/unban',{model_name:'${name}'})">🔓 Unban</button>`:`<button class="btn btn-crit" onclick="adminCmd('POST','/api/ai/admin/model/ban',{model_name:'${name}'})">⛔ Ban</button>`}</td>

docker-compose.yml CHANGED Viewed

@@ -1,4 +1,5 @@
 services:
   qdrant:
     image: qdrant/qdrant:latest
     ports:
@@ -6,19 +7,53 @@ services:
     volumes:
       - qdrant_data:/qdrant/storage
     restart: unless-stopped
   app:
     build: .
     ports:
       - "7860:7860"
     depends_on:
-      - qdrant
     env_file:
       - ../.env
     environment:
-      # Override to use Docker network name instead of localhost
       - QDRANT_URL=http://qdrant:6333
     restart: unless-stopped
 volumes:
   qdrant_data:

 services:
+  # ─── Qdrant Vector Store ───
   qdrant:
     image: qdrant/qdrant:latest
     ports:
     volumes:
       - qdrant_data:/qdrant/storage
     restart: unless-stopped
+    healthcheck:
+      # The official qdrant image does not bundle curl (verify for the tag you pin), and Qdrant's
+      # health endpoints are /healthz, /livez and /readyz rather than /health. A curl-based check
+      # never turns healthy, which blocks `app` behind `condition: service_healthy`.
+      # Probe the HTTP port with bash's /dev/tcp instead.
+      test: ["CMD-SHELL", "bash -c ':> /dev/tcp/127.0.0.1/6333' || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+  # ─── Redis Cache (Session Storage & Rate Limit Tracking) ───
+  redis:
+    image: redis:7-alpine
+    ports:
+      - "6379:6379"
+    volumes:
+      - redis_data:/data
+    restart: unless-stopped
+    command: redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+  # ─── Main Application ───
   app:
     build: .
     ports:
       - "7860:7860"
     depends_on:
+      qdrant:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
     env_file:
       - ../.env
     environment:
+      # Use Docker network names for service discovery
       - QDRANT_URL=http://qdrant:6333
+      - REDIS_URL=redis://redis:6379/0
     restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:7860/api/ai/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
 volumes:
   qdrant_data:
+  redis_data: