MuhammadMahmoud commited on
Commit
c1d5b1b
·
1 Parent(s): 1c16632

fix dashboard issues

Browse files
app/services/chat/api/gemini_client.py CHANGED
@@ -66,21 +66,37 @@ class GeminiClient:
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
 
 
 
 
 
 
 
 
69
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
70
  body = getattr(e, "body", None)
71
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
72
- if "quota" in error_msg or "exhausted" in error_msg:
 
73
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
74
  else:
 
75
  return ErrorType.RATE_LIMITED, delay or 15
76
 
 
77
  if status_code == 404 or "not found" in error_msg:
 
78
  return ErrorType.MODEL_NOT_FOUND, 0
79
 
 
80
  if status_code == 400 or "bad request" in error_msg:
81
  if "decommissioned" in error_msg or "offline" in error_msg:
 
82
  return ErrorType.MODEL_DECOMMISSIONED, 0
83
 
 
 
84
  return ErrorType.PERMANENT_FAILURE, 0
85
 
86
  async def call_api(
 
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
69
+ # ─── 5xx Server Errors (502, 503, etc) ───
70
+ if status_code and status_code >= 500 and status_code < 600:
71
+ logger.error(f"🚫 Gemini 5xx error (HTTP {status_code}): provider infrastructure issue")
72
+ if status_code in (502, 503):
73
+ return ErrorType.PERMANENT_FAILURE, 300 # Retry after 5 minutes
74
+ return ErrorType.PERMANENT_FAILURE, 0
75
+
76
+ # ─── Rate Limiting (429) ───
77
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
78
  body = getattr(e, "body", None)
79
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
80
+ if "quota" in error_msg or "exhausted" in error_msg or "resource exhausted" in error_msg:
81
+ logger.warning(f"πŸ’° Gemini quota exhausted (429): {error_msg[:80]}")
82
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
83
  else:
84
+ logger.warning(f"⏱️ Gemini rate limited (429): delaying {delay}s")
85
  return ErrorType.RATE_LIMITED, delay or 15
86
 
87
+ # ─── Not Found (404) ───
88
  if status_code == 404 or "not found" in error_msg:
89
+ logger.error(f"❌ Gemini model not found (404)")
90
  return ErrorType.MODEL_NOT_FOUND, 0
91
 
92
+ # ─── Bad Request (400) ───
93
  if status_code == 400 or "bad request" in error_msg:
94
  if "decommissioned" in error_msg or "offline" in error_msg:
95
+ logger.error(f"❌ Gemini model decommissioned (400)")
96
  return ErrorType.MODEL_DECOMMISSIONED, 0
97
 
98
+ # ─── Default: Unclassified ───
99
+ logger.warning(f"⚠️ Gemini unclassified error (HTTP {status_code}): {error_msg[:80]}")
100
  return ErrorType.PERMANENT_FAILURE, 0
101
 
102
  async def call_api(
app/services/chat/api/groq_client.py CHANGED
@@ -66,20 +66,36 @@ class GroqClient:
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
 
 
 
 
 
 
 
 
69
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
70
  delay = parse_retry_after(getattr(e, "message", str(e)))
71
  if "tpd" in error_msg or "tokens per day" in error_msg or "daily" in error_msg or "insufficient_quota" in error_msg:
 
72
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
73
  else:
 
74
  return ErrorType.RATE_LIMITED, delay or 30
75
 
 
76
  if status_code == 404 or "not found" in error_msg:
 
77
  return ErrorType.MODEL_NOT_FOUND, 0
78
 
 
79
  if status_code == 400 or "bad request" in error_msg:
80
  if "decommissioned" in error_msg or "offline" in error_msg:
 
81
  return ErrorType.MODEL_DECOMMISSIONED, 0
82
 
 
 
83
  return ErrorType.PERMANENT_FAILURE, 0
84
 
85
  async def call_api(
 
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
69
+ # ─── 5xx Server Errors (502, 503, etc) ───
70
+ if status_code and status_code >= 500 and status_code < 600:
71
+ logger.error(f"🚫 Groq 5xx error (HTTP {status_code}): provider infrastructure issue")
72
+ if status_code in (502, 503):
73
+ return ErrorType.PERMANENT_FAILURE, 300 # Retry after 5 minutes
74
+ return ErrorType.PERMANENT_FAILURE, 0
75
+
76
+ # ─── Rate Limiting (429) ───
77
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
78
  delay = parse_retry_after(getattr(e, "message", str(e)))
79
  if "tpd" in error_msg or "tokens per day" in error_msg or "daily" in error_msg or "insufficient_quota" in error_msg:
80
+ logger.warning(f"πŸ’° Groq quota exhausted (429): {error_msg[:80]}")
81
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
82
  else:
83
+ logger.warning(f"⏱️ Groq rate limited (429): delaying {delay}s")
84
  return ErrorType.RATE_LIMITED, delay or 30
85
 
86
+ # ─── Not Found (404) ───
87
  if status_code == 404 or "not found" in error_msg:
88
+ logger.error(f"❌ Groq model not found (404)")
89
  return ErrorType.MODEL_NOT_FOUND, 0
90
 
91
+ # ─── Bad Request (400) ───
92
  if status_code == 400 or "bad request" in error_msg:
93
  if "decommissioned" in error_msg or "offline" in error_msg:
94
+ logger.error(f"❌ Groq model decommissioned (400)")
95
  return ErrorType.MODEL_DECOMMISSIONED, 0
96
 
97
+ # ─── Default: Unclassified ───
98
+ logger.warning(f"⚠️ Groq unclassified error (HTTP {status_code}): {error_msg[:80]}")
99
  return ErrorType.PERMANENT_FAILURE, 0
100
 
101
  async def call_api(
app/services/chat/api/llm_error_classifier.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified LLM error classification logic shared across all provider clients.
3
+
4
+ Standardizes error handling and reduces code duplication.
5
+ """
6
+
7
+ import logging
8
+ from typing import Tuple
9
+ from app.services.chat.api.llm_errors import ErrorType
10
+ from app.services.chat.api.retry_parser import parse_retry_after
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ def classify_llm_error(e: Exception, provider_name: str = "Unknown") -> Tuple[ErrorType, int]:
16
+ """
17
+ Centralized error classification for all LLM providers.
18
+
19
+ Returns:
20
+ Tuple[ErrorType, int]: (error_type, retry_after_seconds)
21
+ """
22
+ error_msg = str(e).lower()
23
+ status_code = getattr(e, "status_code", None)
24
+ body = getattr(e, "body", None)
25
+
26
+ # ─── 5xx Server Errors β†’ PERMANENT_FAILURE (provider is broken) ───
27
+ if status_code and status_code >= 500 and status_code < 600:
28
+ logger.error(f"[{provider_name}] 5xx error (HTTP {status_code}): likely infrastructure issue")
29
+ # Don't immediately retry 5xx β€” likely indicates provider issues
30
+ if status_code == 503: # Service Unavailable
31
+ return ErrorType.PERMANENT_FAILURE, 300 # Retry after 5 minutes
32
+ else:
33
+ return ErrorType.PERMANENT_FAILURE, 0 # Don't auto-retry
34
+
35
+ # ─── Rate Limiting (429) ───
36
+ if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
37
+ delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
38
+
39
+ # Distinguish between quota exhaustion and rate limit
40
+ if ("insufficient_quota" in error_msg or
41
+ "quota" in error_msg or
42
+ "balance" in error_msg or
43
+ "tpd" in error_msg or # Tokens per day (Groq)
44
+ "tokens per day" in error_msg or
45
+ "daily" in error_msg):
46
+ logger.warning(f"[{provider_name}] Quota exhausted (429): {error_msg[:100]}")
47
+ return ErrorType.QUOTA_EXHAUSTED, delay or 3600
48
+ else:
49
+ logger.warning(f"[{provider_name}] Rate limited (429): delaying {delay}s")
50
+ return ErrorType.RATE_LIMITED, delay or 30
51
+
52
+ # ─── Model Not Found (404) ───
53
+ if status_code == 404 or "not found" in error_msg or "model does not exist" in error_msg:
54
+ logger.error(f"[{provider_name}] Model not found (404)")
55
+ return ErrorType.MODEL_NOT_FOUND, 0
56
+
57
+ # ─── Bad Request (400) ───
58
+ if status_code == 400 or "bad request" in error_msg:
59
+ if "decommissioned" in error_msg or "offline" in error_msg or "unavailable" in error_msg:
60
+ logger.error(f"[{provider_name}] Model decommissioned/offline (400)")
61
+ return ErrorType.MODEL_DECOMMISSIONED, 0
62
+ else:
63
+ logger.error(f"[{provider_name}] Bad request (400): {error_msg[:100]}")
64
+ return ErrorType.PERMANENT_FAILURE, 0
65
+
66
+ # ─── Timeout (assumed permanent if not retryable) ───
67
+ if "timeout" in error_msg or "timed out" in error_msg:
68
+ logger.warning(f"[{provider_name}] Timeout error: {error_msg[:100]}")
69
+ return ErrorType.PERMANENT_FAILURE, 60 # Retry after 1 minute
70
+
71
+ # ─── Default: Unclassified error ───
72
+ logger.warning(f"[{provider_name}] Unclassified error: {error_msg[:100]}")
73
+ return ErrorType.PERMANENT_FAILURE, 0
74
+
75
+
76
+ def should_disable_model_permanently(error_type: ErrorType) -> bool:
77
+ """Determine if a model should be permanently disabled based on error type."""
78
+ return error_type in (
79
+ ErrorType.MODEL_NOT_FOUND,
80
+ ErrorType.MODEL_DECOMMISSIONED,
81
+ )
82
+
83
+
84
+ def should_retry_provider(error_type: ErrorType, attempt_count: int = 0) -> bool:
85
+ """
86
+ Determine if we should retry the current provider or fail over.
87
+
88
+ Args:
89
+ error_type: Type of error encountered
90
+ attempt_count: Number of attempts so far
91
+
92
+ Returns:
93
+ bool: True if should retry this provider, False if should failover
94
+ """
95
+ # Transient errors might recover with retry
96
+ if error_type == ErrorType.RATE_LIMITED and attempt_count < 2:
97
+ return True
98
+
99
+ # Quota exhaustion requires failover to another provider
100
+ if error_type == ErrorType.QUOTA_EXHAUSTED:
101
+ return False
102
+
103
+ # Permanent errors should not be retried on same provider
104
+ if error_type in (ErrorType.PERMANENT_FAILURE, ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED):
105
+ return False
106
+
107
+ return False
app/services/chat/api/model_validator.py CHANGED
@@ -3,9 +3,11 @@ Model Validator β€” Startup and periodic health probes for LLM models.
3
 
4
  Features:
5
  - Lightweight 1-token probes (max_tokens=1)
6
- - Probe result caching (60s TTL) β€” healthy models are not re-probed
7
- - Staggered probes (200ms between requests) to avoid burst quota hits
8
  - Separate fast path for periodic revalidation (only probes unhealthy models)
 
 
9
 
10
  Usage:
11
  Called automatically from FastAPI lifespan startup and from the
@@ -21,8 +23,10 @@ logger = logging.getLogger(__name__)
21
 
22
  # Probe result cache: { "Provider/Model": (timestamp, status) }
23
  _probe_cache: dict[str, tuple[float, str]] = {}
24
- _CACHE_TTL = 60 # seconds
25
- _PROBE_STAGGER = 0.2 # seconds between probes to avoid bursts
 
 
26
 
27
 
28
  def _is_cached_healthy(provider: str, model: str) -> bool:
@@ -44,68 +48,118 @@ def _cache_result(provider: str, model: str, status: str):
44
 
45
  async def validate_models_at_startup(revalidation_only: bool = False):
46
  """
47
- Probes each model with a 1-token request.
48
  Disables dead models BEFORE serving traffic.
49
-
 
 
 
 
 
 
50
  Args:
51
  revalidation_only: If True, only probes models that are NOT cached healthy.
52
  Used by periodic background revalidation to save quota.
53
  """
54
  from app.services.chat.api.llm_router import llm_router
55
 
56
- results = {"alive": [], "dead": [], "error": []}
57
  probe_count = 0
58
-
59
- for provider_name, client in llm_router.providers:
60
  if not getattr(client, "client", None):
 
61
  continue
62
 
63
  if hasattr(client, "refresh_dynamic_models"):
64
- await client.refresh_dynamic_models()
 
 
 
65
 
66
  models = list(client._get_all_models()) # ALL models, not just active
67
- for model in models:
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Skip recently-verified healthy models during revalidation
69
  if revalidation_only and _is_cached_healthy(provider_name, model):
70
  results["alive"].append(f"{provider_name}/{model}")
 
71
  continue
72
-
73
- # Stagger probes to avoid burst rate-limit hits
74
- if probe_count > 0:
75
- await asyncio.sleep(_PROBE_STAGGER)
 
 
 
 
 
 
 
 
 
 
76
  probe_count += 1
77
 
78
  try:
79
- await client.client.chat.completions.create(
80
- model=model,
81
- messages=[{"role": "user", "content": "hi"}],
82
- max_tokens=1,
83
- timeout=10.0,
 
 
 
84
  )
85
  results["alive"].append(f"{provider_name}/{model}")
86
  _cache_result(provider_name, model, "alive")
 
87
  logger.info(f"βœ… {provider_name}/{model} β€” alive")
 
 
 
 
 
 
88
  except Exception as e:
89
- err_type, _ = client.classify_error(e)
 
90
  if err_type in (ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED, ErrorType.PERMANENT_FAILURE):
91
  client._permanently_disabled.add(model)
92
  results["dead"].append(f"{provider_name}/{model}")
93
  _cache_result(provider_name, model, "dead")
94
- logger.error(f"❌ {provider_name}/{model} β€” DEAD, disabled")
 
 
95
  elif err_type == ErrorType.QUOTA_EXHAUSTED:
96
  results["error"].append(f"{provider_name}/{model} (quota)")
 
97
  # Don't cache quota errors β€” they're transient
98
- logger.warning(f"⚠️ {provider_name}/{model} β€” quota issue, keeping enabled")
 
99
  else:
100
- results["error"].append(f"{provider_name}/{model} ({e})")
 
101
  logger.warning(f"⚠️ {provider_name}/{model} β€” probe error: {e}")
102
 
103
  mode = "revalidation" if revalidation_only else "startup"
104
  logger.info(
105
- f"Model validation complete ({mode}): "
106
  f"{len(results['alive'])} alive, "
107
  f"{len(results['dead'])} dead, "
108
  f"{len(results['error'])} warnings, "
 
109
  f"{probe_count} probes sent"
110
  )
111
  return results
 
3
 
4
  Features:
5
  - Lightweight 1-token probes (max_tokens=1)
6
+ - Aggressive result caching (300s TTL) β€” minimize redundant probes
7
+ - Intelligent staggering (500ms between providers, 200ms within provider)
8
  - Separate fast path for periodic revalidation (only probes unhealthy models)
9
+ - Per-provider sequential probing to avoid burst quota hits
10
+ - Automatic provider skip after N consecutive failures
11
 
12
  Usage:
13
  Called automatically from FastAPI lifespan startup and from the
 
23
 
24
  # Probe result cache: { "Provider/Model": (timestamp, status) }
25
  _probe_cache: dict[str, tuple[float, str]] = {}
26
+ _CACHE_TTL = 300 # seconds (5 minutes - aggressive caching to avoid quota hits)
27
+ _PROBE_STAGGER_WITHIN = 0.2 # seconds between models within same provider
28
+ _PROBE_STAGGER_BETWEEN = 0.5 # seconds between providers
29
+ _MAX_CONSECUTIVE_FAILURES = 5 # Skip provider if N models fail in a row
30
 
31
 
32
  def _is_cached_healthy(provider: str, model: str) -> bool:
 
48
 
49
  async def validate_models_at_startup(revalidation_only: bool = False):
50
  """
51
+ Intelligently probes each model with sequential per-provider strategy.
52
  Disables dead models BEFORE serving traffic.
53
+
54
+ Strategy:
55
+ 1. Group models by provider
56
+ 2. For each provider, probe models sequentially with within-provider stagger
57
+ 3. Skip provider after N consecutive failures (likely rate-limited)
58
+ 4. Aggressively cache results to avoid re-probing
59
+
60
  Args:
61
  revalidation_only: If True, only probes models that are NOT cached healthy.
62
  Used by periodic background revalidation to save quota.
63
  """
64
  from app.services.chat.api.llm_router import llm_router
65
 
66
+ results = {"alive": [], "dead": [], "error": [], "skipped": []}
67
  probe_count = 0
68
+
69
+ for provider_idx, (provider_name, client) in enumerate(llm_router.providers):
70
  if not getattr(client, "client", None):
71
+ logger.warning(f"⏭️ Skipping {provider_name} β€” no client")
72
  continue
73
 
74
  if hasattr(client, "refresh_dynamic_models"):
75
+ try:
76
+ await client.refresh_dynamic_models()
77
+ except Exception as e:
78
+ logger.warning(f"Failed to refresh {provider_name} models: {e}")
79
 
80
  models = list(client._get_all_models()) # ALL models, not just active
81
+ if not models:
82
+ logger.warning(f"⏭️ Skipping {provider_name} β€” no models found")
83
+ continue
84
+
85
+ logger.info(f"πŸ” Probing {provider_name} ({len(models)} models)...")
86
+
87
+ # Stagger between providers to avoid burst cross-provider
88
+ if provider_idx > 0:
89
+ await asyncio.sleep(_PROBE_STAGGER_BETWEEN)
90
+
91
+ consecutive_failures = 0
92
+
93
+ for model_idx, model in enumerate(models):
94
  # Skip recently-verified healthy models during revalidation
95
  if revalidation_only and _is_cached_healthy(provider_name, model):
96
  results["alive"].append(f"{provider_name}/{model}")
97
+ consecutive_failures = 0
98
  continue
99
+
100
+ # Skip provider if too many consecutive failures (likely quota crisis)
101
+ if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
102
+ reason = f"Provider {provider_name} β€” {_MAX_CONSECUTIVE_FAILURES} consecutive failures, skipping remaining models"
103
+ logger.error(f"🚫 {reason}")
104
+ remaining = len(models) - model_idx
105
+ for remaining_model in models[model_idx:]:
106
+ results["skipped"].append(f"{provider_name}/{remaining_model}")
107
+ break
108
+
109
+ # Stagger within provider to avoid burst
110
+ if model_idx > 0:
111
+ await asyncio.sleep(_PROBE_STAGGER_WITHIN)
112
+
113
  probe_count += 1
114
 
115
  try:
116
+ await asyncio.wait_for(
117
+ client.client.chat.completions.create(
118
+ model=model,
119
+ messages=[{"role": "user", "content": "hi"}],
120
+ max_tokens=1,
121
+ temperature=0.5,
122
+ ),
123
+ timeout=10.0
124
  )
125
  results["alive"].append(f"{provider_name}/{model}")
126
  _cache_result(provider_name, model, "alive")
127
+ consecutive_failures = 0
128
  logger.info(f"βœ… {provider_name}/{model} β€” alive")
129
+
130
+ except asyncio.TimeoutError:
131
+ results["error"].append(f"{provider_name}/{model} (timeout)")
132
+ consecutive_failures += 1
133
+ logger.warning(f"⏱️ {provider_name}/{model} β€” timeout, probe error")
134
+
135
  except Exception as e:
136
+ err_type, retry_after = client.classify_error(e)
137
+
138
  if err_type in (ErrorType.MODEL_NOT_FOUND, ErrorType.MODEL_DECOMMISSIONED, ErrorType.PERMANENT_FAILURE):
139
  client._permanently_disabled.add(model)
140
  results["dead"].append(f"{provider_name}/{model}")
141
  _cache_result(provider_name, model, "dead")
142
+ consecutive_failures += 1
143
+ logger.error(f"❌ {provider_name}/{model} β€” DEAD, permanently disabled")
144
+
145
  elif err_type == ErrorType.QUOTA_EXHAUSTED:
146
  results["error"].append(f"{provider_name}/{model} (quota)")
147
+ consecutive_failures += 1
148
  # Don't cache quota errors β€” they're transient
149
+ logger.warning(f"⚠️ {provider_name}/{model} β€” quota issue (may recover), keeping enabled")
150
+
151
  else:
152
+ results["error"].append(f"{provider_name}/{model} ({err_type.name})")
153
+ consecutive_failures += 1
154
  logger.warning(f"⚠️ {provider_name}/{model} β€” probe error: {e}")
155
 
156
  mode = "revalidation" if revalidation_only else "startup"
157
  logger.info(
158
+ f"βœ… Model validation complete ({mode}): "
159
  f"{len(results['alive'])} alive, "
160
  f"{len(results['dead'])} dead, "
161
  f"{len(results['error'])} warnings, "
162
+ f"{len(results['skipped'])} skipped, "
163
  f"{probe_count} probes sent"
164
  )
165
  return results
app/services/chat/api/openai_client.py CHANGED
@@ -66,21 +66,37 @@ class OpenAIClient:
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
 
 
 
 
 
 
 
 
69
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
70
  body = getattr(e, "body", None)
71
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
72
  if "insufficient_quota" in error_msg or "quota" in error_msg or "exhausted" in error_msg:
 
73
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
74
  else:
 
75
  return ErrorType.RATE_LIMITED, delay or 15
76
 
 
77
  if status_code == 404 or "not found" in error_msg:
 
78
  return ErrorType.MODEL_NOT_FOUND, 0
79
 
 
80
  if status_code == 400 or "bad request" in error_msg:
81
  if "decommissioned" in error_msg or "offline" in error_msg:
 
82
  return ErrorType.MODEL_DECOMMISSIONED, 0
83
 
 
 
84
  return ErrorType.PERMANENT_FAILURE, 0
85
 
86
  async def call_api(
 
66
  error_msg = str(e).lower()
67
  status_code = getattr(e, "status_code", None)
68
 
69
+ # ─── 5xx Server Errors (502, 503, etc) ───
70
+ if status_code and status_code >= 500 and status_code < 600:
71
+ logger.error(f"🚫 OpenAI 5xx error (HTTP {status_code}): provider infrastructure issue")
72
+ if status_code in (502, 503):
73
+ return ErrorType.PERMANENT_FAILURE, 300 # Retry after 5 minutes
74
+ return ErrorType.PERMANENT_FAILURE, 0
75
+
76
+ # ─── Rate Limiting (429) ───
77
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
78
  body = getattr(e, "body", None)
79
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
80
  if "insufficient_quota" in error_msg or "quota" in error_msg or "exhausted" in error_msg:
81
+ logger.warning(f"πŸ’° OpenAI quota exhausted (429): {error_msg[:80]}")
82
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
83
  else:
84
+ logger.warning(f"⏱️ OpenAI rate limited (429): delaying {delay}s")
85
  return ErrorType.RATE_LIMITED, delay or 15
86
 
87
+ # ─── Not Found (404) ───
88
  if status_code == 404 or "not found" in error_msg:
89
+ logger.error(f"❌ OpenAI model not found (404)")
90
  return ErrorType.MODEL_NOT_FOUND, 0
91
 
92
+ # ─── Bad Request (400) ───
93
  if status_code == 400 or "bad request" in error_msg:
94
  if "decommissioned" in error_msg or "offline" in error_msg:
95
+ logger.error(f"❌ OpenAI model decommissioned (400)")
96
  return ErrorType.MODEL_DECOMMISSIONED, 0
97
 
98
+ # ─── Default: Unclassified ───
99
+ logger.warning(f"⚠️ OpenAI unclassified error (HTTP {status_code}): {error_msg[:80]}")
100
  return ErrorType.PERMANENT_FAILURE, 0
101
 
102
  async def call_api(
app/services/chat/api/openrouter_client.py CHANGED
@@ -112,21 +112,38 @@ class OpenRouterClient:
112
  error_msg = str(e).lower()
113
  status_code = getattr(e, "status_code", None)
114
 
 
 
 
 
 
 
 
 
 
115
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
116
  body = getattr(e, "body", None)
117
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
118
  if "insufficient_quota" in error_msg or "quota" in error_msg or "balance" in error_msg:
 
119
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
120
  else:
 
121
  return ErrorType.RATE_LIMITED, delay or 15
122
 
 
123
  if status_code == 404 or "not found" in error_msg:
 
124
  return ErrorType.MODEL_NOT_FOUND, 0
125
 
 
126
  if status_code == 400 or "bad request" in error_msg:
127
  if "decommissioned" in error_msg or "offline" in error_msg:
 
128
  return ErrorType.MODEL_DECOMMISSIONED, 0
129
 
 
 
130
  return ErrorType.PERMANENT_FAILURE, 0
131
 
132
  async def call_api(
 
112
  error_msg = str(e).lower()
113
  status_code = getattr(e, "status_code", None)
114
 
115
+ # ─── 5xx Server Errors (502, 503, etc) ───
116
+ if status_code and status_code >= 500 and status_code < 600:
117
+ logger.error(f"🚫 OpenRouter 5xx error (HTTP {status_code}): provider infrastructure issue")
118
+ # 502/503 = provider is down, likely transient
119
+ if status_code in (502, 503):
120
+ return ErrorType.PERMANENT_FAILURE, 300 # Retry after 5 minutes
121
+ return ErrorType.PERMANENT_FAILURE, 0
122
+
123
+ # ─── Rate Limiting (429) ───
124
  if status_code == 429 or "rate limit" in error_msg or "ratelimit" in error_msg:
125
  body = getattr(e, "body", None)
126
  delay = parse_retry_after(body) if body else parse_retry_after(error_msg)
127
  if "insufficient_quota" in error_msg or "quota" in error_msg or "balance" in error_msg:
128
+ logger.warning(f"πŸ’° OpenRouter quota exhausted (429): {error_msg[:80]}")
129
  return ErrorType.QUOTA_EXHAUSTED, delay or 3600
130
  else:
131
+ logger.warning(f"⏱️ OpenRouter rate limited (429): delaying {delay}s")
132
  return ErrorType.RATE_LIMITED, delay or 15
133
 
134
+ # ─── Not Found (404) ───
135
  if status_code == 404 or "not found" in error_msg:
136
+ logger.error(f"❌ OpenRouter model not found (404)")
137
  return ErrorType.MODEL_NOT_FOUND, 0
138
 
139
+ # ─── Bad Request (400) ───
140
  if status_code == 400 or "bad request" in error_msg:
141
  if "decommissioned" in error_msg or "offline" in error_msg:
142
+ logger.error(f"❌ OpenRouter model decommissioned (400)")
143
  return ErrorType.MODEL_DECOMMISSIONED, 0
144
 
145
+ # ─── Default: Unclassified ───
146
+ logger.warning(f"⚠️ OpenRouter unclassified error (HTTP {status_code}): {error_msg[:80]}")
147
  return ErrorType.PERMANENT_FAILURE, 0
148
 
149
  async def call_api(
app/static/dashboard.html CHANGED
@@ -413,7 +413,11 @@ function renderSys(sys,redis,cost){
413
  function renderProviders(providers,breakers){
414
  const tb=document.querySelector('#provTable tbody'); if(!tb)return; tb.innerHTML='';
415
  let sc=0,sr=0,act=0,cnt=0;
416
- if(!providers) return;
 
 
 
 
417
  for(const[name,p]of Object.entries(providers)){
418
  cnt++;sc+=(p.score||0);sr+=(p.success_rate_window||0);
419
  const cb=(breakers&&breakers[name])||{};
@@ -455,13 +459,18 @@ function renderProviders(providers,breakers){
455
 
456
  // Render models
457
  function renderModels(models){
458
- const tb=document.querySelector('#modTable tbody');tb.innerHTML='';
459
- Object.entries(models||{}).sort((a,b)=>b[1].total_calls-a[1].total_calls).forEach(([name,m])=>{
 
 
 
 
 
460
  const ban=BANNED.includes(name),s=m.success_rate;
461
  const sc=s>=95?'t-ok':s>=70?'t-warn':'t-crit';
462
  tb.innerHTML+=`<tr style="opacity:${ban?.6:1}">
463
  <td><span class="tag" style="${ban?'text-decoration:line-through;color:var(--tm)':''}">${name}</span></td>
464
- <td class="mono"><span class="t-ok">βœ“${m.total_calls-m.total_errors}</span> <span class="t-mute">|</span> <span class="t-crit">βœ—${m.total_errors}</span></td>
465
  <td class="mono ${sc}" style="font-weight:700">${s.toFixed(1)}%</td>
466
  <td class="mono">${m.avg_latency_ms}ms</td>
467
  <td>${ban?`<button class="btn btn-ok" onclick="adminCmd('POST','/api/ai/admin/model/unban',{model_name:'${name}'})">πŸ”“ Unban</button>`:`<button class="btn btn-crit" onclick="adminCmd('POST','/api/ai/admin/model/ban',{model_name:'${name}'})">β›” Ban</button>`}</td>
 
413
  function renderProviders(providers,breakers){
414
  const tb=document.querySelector('#provTable tbody'); if(!tb)return; tb.innerHTML='';
415
  let sc=0,sr=0,act=0,cnt=0;
416
+ if(!providers || Object.keys(providers).length === 0) {
417
+ document.getElementById('kProv').textContent = '0/0';
418
+ tb.innerHTML = '<tr><td colspan="6" style="text-align:center;color:var(--tm);padding:24px;font-style:italic">No active providers found in registry. Router is starting... πŸ”„</td></tr>';
419
+ return;
420
+ }
421
  for(const[name,p]of Object.entries(providers)){
422
  cnt++;sc+=(p.score||0);sr+=(p.success_rate_window||0);
423
  const cb=(breakers&&breakers[name])||{};
 
459
 
460
  // Render models
461
  function renderModels(models){
462
+ const tb=document.querySelector('#modTable tbody'); if(!tb) return; tb.innerHTML='';
463
+ const entries = Object.entries(models||{});
464
+ if(entries.length === 0){
465
+ tb.innerHTML = '<tr><td colspan="5" style="text-align:center;color:var(--tm);padding:24px;font-style:italic">No model usage recorded in memory yet. Awaiting chat traffic... ⏳</td></tr>';
466
+ return;
467
+ }
468
+ entries.sort((a,b)=>b[1].total_calls-a[1].total_calls).forEach(([name,m])=>{
469
  const ban=BANNED.includes(name),s=m.success_rate;
470
  const sc=s>=95?'t-ok':s>=70?'t-warn':'t-crit';
471
  tb.innerHTML+=`<tr style="opacity:${ban?.6:1}">
472
  <td><span class="tag" style="${ban?'text-decoration:line-through;color:var(--tm)':''}">${name}</span></td>
473
+ <td class="mono"><span class="t-ok">βœ“${Math.max(0, m.total_calls-m.total_errors)}</span> <span class="t-mute">|</span> <span class="t-crit">βœ—${m.total_errors}</span></td>
474
  <td class="mono ${sc}" style="font-weight:700">${s.toFixed(1)}%</td>
475
  <td class="mono">${m.avg_latency_ms}ms</td>
476
  <td>${ban?`<button class="btn btn-ok" onclick="adminCmd('POST','/api/ai/admin/model/unban',{model_name:'${name}'})">πŸ”“ Unban</button>`:`<button class="btn btn-crit" onclick="adminCmd('POST','/api/ai/admin/model/ban',{model_name:'${name}'})">β›” Ban</button>`}</td>
docker-compose.yml CHANGED
@@ -1,4 +1,5 @@
1
  services:
 
2
  qdrant:
3
  image: qdrant/qdrant:latest
4
  ports:
@@ -6,19 +7,53 @@ services:
6
  volumes:
7
  - qdrant_data:/qdrant/storage
8
  restart: unless-stopped
 
 
 
 
 
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  app:
11
  build: .
12
  ports:
13
  - "7860:7860"
14
  depends_on:
15
- - qdrant
 
 
 
16
  env_file:
17
  - ../.env
18
  environment:
19
- # Override to use Docker network name instead of localhost
20
  - QDRANT_URL=http://qdrant:6333
 
21
  restart: unless-stopped
 
 
 
 
 
 
22
 
23
  volumes:
24
  qdrant_data:
 
 
1
  services:
2
+ # ─── Qdrant Vector Store ───
3
  qdrant:
4
  image: qdrant/qdrant:latest
5
  ports:
 
7
  volumes:
8
  - qdrant_data:/qdrant/storage
9
  restart: unless-stopped
10
+ healthcheck:
11
+ test: ["CMD", "curl", "-f", "http://localhost:6333/health"]
12
+ interval: 30s
13
+ timeout: 10s
14
+ retries: 3
15
+ start_period: 10s
16
 
17
+ # ─── Redis Cache (Session Storage & Rate Limit Tracking) ───
18
+ redis:
19
+ image: redis:7-alpine
20
+ ports:
21
+ - "6379:6379"
22
+ volumes:
23
+ - redis_data:/data
24
+ restart: unless-stopped
25
+ command: redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru
26
+ healthcheck:
27
+ test: ["CMD", "redis-cli", "ping"]
28
+ interval: 30s
29
+ timeout: 10s
30
+ retries: 3
31
+ start_period: 10s
32
+
33
+ # ─── Main Application ───
34
  app:
35
  build: .
36
  ports:
37
  - "7860:7860"
38
  depends_on:
39
+ qdrant:
40
+ condition: service_healthy
41
+ redis:
42
+ condition: service_healthy
43
  env_file:
44
  - ../.env
45
  environment:
46
+ # Use Docker network names for service discovery
47
  - QDRANT_URL=http://qdrant:6333
48
+ - REDIS_URL=redis://redis:6379/0
49
  restart: unless-stopped
50
+ healthcheck:
51
+ test: ["CMD", "curl", "-f", "http://localhost:7860/api/ai/health"]
52
+ interval: 30s
53
+ timeout: 10s
54
+ retries: 3
55
+ start_period: 30s
56
 
57
  volumes:
58
  qdrant_data:
59
+ redis_data: