MuhammadMahmoud committed
Commit 8d492c4 · 1 Parent(s): 468ea61

F:\Aoun Grad Project\AwnAiService\AwnDeploy

__pycache__/main.cpython-313.pyc CHANGED
Binary files a/__pycache__/main.cpython-313.pyc and b/__pycache__/main.cpython-313.pyc differ
 
app/api/__pycache__/api.cpython-313.pyc CHANGED
Binary files a/app/api/__pycache__/api.cpython-313.pyc and b/app/api/__pycache__/api.cpython-313.pyc differ
 
app/api/api.py CHANGED
@@ -1,5 +1,5 @@
  from fastapi import APIRouter, Depends
- from app.api import prediction, ocr, chat, voice, feedback, kb_admin, health
+ from app.api import prediction, ocr, chat, voice, feedback, kb_admin, health, metrics
  from app.core.auth import verify_api_key

  # All routes under /api require X-API-Key header
@@ -14,3 +14,4 @@ api_router.include_router(feedback.router, prefix="/ai", tags=["ai"])
  api_router.include_router(kb_admin.router, prefix="/ai", tags=["ai"])

  public_router.include_router(health.router, prefix="/ai", tags=["ai"])
+ public_router.include_router(metrics.router, prefix="/ai", tags=["metrics"])
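Note: the `metrics` module imported above is not included in this commit's diff. A minimal sketch of what `app/api/metrics.py` could contain, assuming it only needs to expose a `router` attribute that forwards to the observability router (hypothetical, for illustration only):

    # app/api/metrics.py (hypothetical; not part of this diff)
    # Re-export the Prometheus router so api.py can mount it like the other modules.
    from app.core.observability import metrics_router as router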
app/core/observability.py ADDED
@@ -0,0 +1,157 @@
+ """
+ Centralised Prometheus metrics and helpers.
+
+ We keep all metric definitions in one place so providers, Redis, and the
+ router can push to the same registry without circular imports.
+ """
+
+ from typing import Optional, Union
+
+ from fastapi import APIRouter
+ from fastapi.responses import Response
+ from prometheus_client import (
+     CONTENT_TYPE_LATEST,
+     Counter,
+     Gauge,
+     Histogram,
+     generate_latest,
+ )
+
+ # ─── Metric Definitions ──────────────────────────────────────────────────────
+
+ # LLM call performance
+ llm_latency_ms = Histogram(
+     "llm_latency_ms",
+     "End-to-end LLM latency in milliseconds",
+     ["provider", "model"],
+     buckets=[50, 100, 200, 400, 800, 1200, 1500, 2000, 3000, 5000, 8000, 12000],
+ )
+
+ llm_ttft_ms = Histogram(
+     "llm_ttft_ms",
+     "Time to first token (TTFT) in milliseconds",
+     ["provider", "model"],
+     buckets=[50, 100, 200, 400, 800, 1200, 1500, 2000, 3000, 5000, 8000, 12000],
+ )
+
+ llm_calls_total = Counter(
+     "llm_calls_total",
+     "Total LLM calls attempted per provider/model",
+     ["provider", "model"],
+ )
+
+ llm_errors_total = Counter(
+     "llm_errors_total",
+     "Total LLM errors per provider/model/error_type",
+     ["provider", "model", "error_type"],
+ )
+
+ # Circuit breaker and bulkhead health
+ llm_breaker_open_state = Gauge(
+     "llm_breaker_open_state",
+     "Circuit breaker state (1=open, 0.5=half-open, 0=closed)",
+     ["provider"],
+ )
+
+ llm_bulkhead_in_use = Gauge(
+     "llm_bulkhead_in_use",
+     "Concurrent in-flight LLM calls per provider",
+     ["provider"],
+ )
+
+ llm_bulkhead_capacity = Gauge(
+     "llm_bulkhead_capacity",
+     "Configured bulkhead slots per provider",
+     ["provider"],
+ )
+
+ llm_bulkhead_skips_total = Counter(
+     "llm_bulkhead_skips_total",
+     "Requests skipped because bulkhead was saturated",
+     ["provider"],
+ )
+
+ # Redis health
+ redis_rtt_ms = Histogram(
+     "redis_rtt_ms",
+     "Redis round-trip latency in milliseconds",
+     ["endpoint"],
+     buckets=[1, 2, 5, 10, 25, 50, 75, 100, 250, 500, 1000],
+ )
+
+ redis_errors_total = Counter(
+     "redis_errors_total",
+     "Redis errors grouped by endpoint and operation",
+     ["endpoint", "operation"],
+ )
+
+
+ # ─── Helper Functions ───────────────────────────────────────────────────────
+
+ def record_llm_success(provider: str, model: str, latency_ms: float, ttft_ms: Optional[float] = None) -> None:
+     llm_calls_total.labels(provider=provider, model=model).inc()
+     llm_latency_ms.labels(provider=provider, model=model).observe(latency_ms)
+     if ttft_ms is not None:
+         llm_ttft_ms.labels(provider=provider, model=model).observe(ttft_ms)
+
+
+ def record_llm_error(provider: str, model: str, error_type: str = "unknown") -> None:
+     llm_calls_total.labels(provider=provider, model=model).inc()
+     llm_errors_total.labels(provider=provider, model=model, error_type=error_type).inc()
+
+
+ def set_bulkhead_capacity(provider: str, capacity: int) -> None:
+     llm_bulkhead_capacity.labels(provider=provider).set(capacity)
+
+
+ def bulkhead_acquire(provider: str) -> None:
+     llm_bulkhead_in_use.labels(provider=provider).inc()
+
+
+ def bulkhead_release(provider: str) -> None:
+     """Decrement the in-use bulkhead gauge.
+
+     Callers (the _provider_slot context-manager) are responsible for
+     ensuring acquire/release are balanced. We simply decrement — Prometheus
+     Gauges support negative values internally and we avoid touching the
+     private _value attribute which is not part of the public API.
+     """
+     llm_bulkhead_in_use.labels(provider=provider).dec()
+
+
+ def bulkhead_skip(provider: str) -> None:
+     llm_bulkhead_skips_total.labels(provider=provider).inc()
+
+
+ def set_breaker_state(provider: str, state: Union[str, "CircuitState"]) -> None:
+     """
+     Accepts either CircuitState enum or its string value.
+     Closed=0, Half-open=0.5, Open=1 — matches alert rules.
+     """
+     state_val = state.value if hasattr(state, "value") else str(state)
+     gauge_val = 0.0
+     if state_val == "open":
+         gauge_val = 1.0
+     elif state_val == "half_open":
+         gauge_val = 0.5
+     llm_breaker_open_state.labels(provider=provider).set(gauge_val)
+
+
+ def observe_redis_rtt(endpoint: str, rtt_ms: float) -> None:
+     redis_rtt_ms.labels(endpoint=endpoint).observe(rtt_ms)
+
+
+ def record_redis_error(endpoint: str, operation: str) -> None:
+     redis_errors_total.labels(endpoint=endpoint, operation=operation).inc()
+
+
+ # ─── /metrics Endpoint (Prometheus scrape) ──────────────────────────────────
+
+ metrics_router = APIRouter()
+
+
+ @metrics_router.get("/metrics", include_in_schema=False)
+ async def prometheus_metrics() -> Response:
+     """Expose Prometheus metrics for Grafana/Prometheus scrapers."""
+     return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
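As a usage illustration (not part of the commit), a provider call site could feed these helpers roughly as follows; `call_provider` is a hypothetical coroutine standing in for the real provider client:

    import time
    from app.core.observability import record_llm_success, record_llm_error

    async def timed_call(provider: str, model: str, call_provider):
        # Time the call and record either a success or an error sample.
        start = time.perf_counter()
        try:
            result = await call_provider()
            record_llm_success(provider, model, (time.perf_counter() - start) * 1000)
            return result
        except Exception as exc:
            record_llm_error(provider, model, type(exc).__name__)
            raise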
app/core/redis_client.py CHANGED
@@ -1,8 +1,12 @@
- import redis.asyncio as redis
+ import asyncio
  import logging
  import os
+ import time
  from typing import Optional, List

+ import redis.asyncio as redis
+ from app.core.observability import observe_redis_rtt, record_redis_error
+
  logger = logging.getLogger(__name__)

  class RedisManager:
@@ -13,6 +17,7 @@ class RedisManager:
          self.last_ping_error: Optional[str] = None
          self.last_ping_ts: float = 0.0
          self._ping_task: Optional["asyncio.Task"] = None
+         self.active_endpoint: str = ""

      def _parse_urls(self) -> List[str]:
          """
@@ -40,12 +45,14 @@
                  # Ping to verify
                  await self.redis.ping()
                  self.is_connected = True
+                 self.active_endpoint = redis_url
                  await self._record_ping()
                  self._start_ping_loop()
                  logger.info(f"Connected to Redis at {redis_url}")
                  return
              except Exception as e:
                  last_error = e
+                 record_redis_error(redis_url, "connect")
                  logger.warning(f"Redis connect failed at {redis_url}: {e}")

          logger.error(f"Failed to connect to any Redis endpoint ({urls}). System will degrade to local memory. Last error: {last_error}")
@@ -63,34 +70,44 @@
          logger.info("Disconnected from Redis")

      async def _record_ping(self):
-         """Measure RTT and store locally."""
+         """Measure RTT and record to Prometheus + local state."""
          if not self.redis:
              return
-         import time as _t
-         start = _t.perf_counter()
+         start = time.perf_counter()
          try:
              await self.redis.ping()
-             self.last_ping_ms = (_t.perf_counter() - start) * 1000
+             self.last_ping_ms = (time.perf_counter() - start) * 1000
              self.last_ping_error = None
-             self.last_ping_ts = _t.time()
+             self.last_ping_ts = time.time()
+             observe_redis_rtt(self.active_endpoint or "unknown", self.last_ping_ms)
          except Exception as e:
              self.last_ping_error = str(e)
-             self.last_ping_ts = _t.time()
+             self.last_ping_ts = time.time()
+             record_redis_error(self.active_endpoint or "unknown", "ping")

      def _start_ping_loop(self):
-         """Fire-and-forget periodic ping task for telemetry."""
+         """Fire-and-forget periodic ping task for RTT telemetry.
+
+         Uses a 30-second interval to align with the Redis client's own
+         ``health_check_interval`` and avoid unnecessary network chatter.
+         Handles ``CancelledError`` silently so graceful shutdown is clean.
+         """
          if self._ping_task:
              return
-         import asyncio
+
          async def _loop():
-             while True:
-                 await asyncio.sleep(10)
-                 await self._record_ping()
+             try:
+                 while True:
+                     await asyncio.sleep(30)  # aligned with health_check_interval
+                     await self._record_ping()
+             except asyncio.CancelledError:
+                 pass  # clean shutdown — do not re-raise
+
          try:
              loop = asyncio.get_running_loop()
              self._ping_task = loop.create_task(_loop())
          except RuntimeError:
-             # no running loop; skip
+             # No running event loop (sync context) — skip
              self._ping_task = None

  redis_client = RedisManager()
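The same helpers can wrap individual Redis operations outside the ping loop. A small sketch (illustrative only; the `cache_get` wrapper is not part of this commit):

    import time
    from app.core.observability import observe_redis_rtt, record_redis_error
    from app.core.redis_client import redis_client

    async def cache_get(key: str):
        # Hypothetical helper: time a single GET and feed the same Prometheus
        # series that the background ping loop populates.
        start = time.perf_counter()
        try:
            value = await redis_client.redis.get(key)
            observe_redis_rtt(redis_client.active_endpoint or "unknown", (time.perf_counter() - start) * 1000)
            return value
        except Exception:
            record_redis_error(redis_client.active_endpoint or "unknown", "get")
            raise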
app/services/chat/api/circuit_breaker.py CHANGED
@@ -35,6 +35,7 @@ from dataclasses import dataclass, field
  from enum import Enum
  from typing import Optional
  from app.core.redis_client import redis_client
+ from app.core.observability import set_breaker_state

  logger = logging.getLogger(__name__)

@@ -76,6 +77,7 @@ class CircuitBreaker:

      def __post_init__(self):
          self._recovery_base = self.recovery_timeout
+         set_breaker_state(self.name, self.state)

      # ── Redis helpers ──────────────────────────────────────────────────────
      def _redis_key(self) -> str:
@@ -155,6 +157,7 @@
          if elapsed >= self.recovery_timeout:
              if not self._probe_in_flight:
                  self.state = CircuitState.HALF_OPEN
+                 set_breaker_state(self.name, self.state)
                  self._probe_in_flight = True
                  logger.info(
                      f"Circuit [{self.name}]: OPEN → HALF_OPEN "
@@ -184,6 +187,7 @@
              f"[CircuitBreaker] {self.name} {previous.value.upper()} → CLOSED "
              f"(recovered, recovery_timeout reset to {self._recovery_base}s)"
          )
+         set_breaker_state(self.name, self.state)
          self._schedule(self._push_state())

      def record_failure(self, recovery_override: Optional[int] = None):
@@ -216,11 +220,14 @@
                  self.MAX_RECOVERY_TIMEOUT
              )
              self.state = CircuitState.OPEN
+             set_breaker_state(self.name, self.state)
              logger.warning(
                  f"[CircuitBreaker] {self.name} → OPEN "
                  f"(failures={self.failure_count}/{self.failure_threshold}, "
                  f"recovery_in={self.recovery_timeout}s, cycle={self._open_cycles})"
              )
+         else:
+             set_breaker_state(self.name, self.state)
          self._schedule(self._push_state())

      def seconds_until_retry(self) -> int:
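For reference, `set_breaker_state` accepts either the `CircuitState` enum or its string value; a quick illustration of the resulting gauge values, assuming the enum's string values are "closed", "half_open" and "open" as the helper's comparisons suggest (the provider name is a placeholder):

    from app.core.observability import set_breaker_state

    set_breaker_state("example-provider", "closed")     # gauge -> 0.0
    set_breaker_state("example-provider", "half_open")  # gauge -> 0.5
    set_breaker_state("example-provider", "open")       # gauge -> 1.0 (what AllProvidersDown alerts on)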
app/services/chat/api/llm_router.py CHANGED
@@ -20,10 +20,32 @@ from app.services.chat.api.circuit_breaker import circuit_registry
  from app.services.chat.api.model_registry import model_registry
  from app.core.redis_client import redis_client
  from app.core.telemetry import telemetry
+ from app.core.observability import (
+     record_llm_error,
+     record_llm_success,
+     set_bulkhead_capacity,
+     bulkhead_acquire,
+     bulkhead_release,
+     bulkhead_skip,
+ )
  from contextlib import asynccontextmanager

  logger = logging.getLogger(__name__)

+
+ def _get_model_name(client) -> str:
+     """Extract the first active model name from a provider client.
+
+     Centralises the repetitive ``client._get_active_models()[0] if ...``
+     pattern used throughout the router, with a safe fallback of ``"unknown"``.
+     """
+     try:
+         models = client._get_active_models() if hasattr(client, "_get_active_models") else []
+         return models[0] if models else "unknown"
+     except Exception:
+         return "unknown"
+
+
  # Graceful degradation message shown to users when all providers are unavailable
  _DEGRADATION_MSG_AR = (
      "عذراً، جميع خدمات الذكاء الاصطناعي غير متوفرة حالياً بسبب ضغط مرتفع. "
@@ -230,6 +252,7 @@
          for name, _ in self.providers:
              self.provider_states[name] = ProviderState()
              budget = self._concurrency_budget.get(name)
+             set_bulkhead_capacity(name, budget if budget else 0)
              if budget and budget > 0:
                  self._semaphores[name] = asyncio.Semaphore(budget)
                  logger.info(f" Bulkhead [{name}] initialised — {budget} concurrent slots")
@@ -266,6 +289,7 @@
          if sem.locked():
              # _value == 0: every slot is in use
              self._bulkhead_skips[name] += 1
+             bulkhead_skip(name)
              logger.debug(
                  f"Bulkhead [{name}] saturated — 0/{budget} slots free "
                  f"(total skips: {self._bulkhead_skips[name]})"
@@ -276,11 +300,13 @@
          # Capacity is available; acquire() resolves immediately (no await needed
          # for a semaphore with _value > 0, but we keep await for correctness)
          await sem.acquire()
+         bulkhead_acquire(name)
          logger.debug(f"Bulkhead [{name}] acquired — {sem._value}/{budget} slots remaining")
          try:
              yield True
          finally:
              sem.release()
+             bulkhead_release(name)

      async def get_active_requests(self) -> int:
          """
@@ -469,8 +495,9 @@

                  cb.record_success()
                  self.provider_states[name].record_success(latency)
-                 model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                 model_name = _get_model_name(client)
                  telemetry.record_success(name, model_name, latency)
+                 record_llm_success(name, model_name, latency)

                  self._log_routing_decision(request_id, name, "success", latency)
                  span.set_attribute("llm.latency_ms", latency)
@@ -478,8 +505,10 @@
                  return result
              except Exception as e:
                  last_error = e
-                 model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                 model_name = _get_model_name(client)
                  telemetry.record_error(name, model_name)
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(name, model_name, error_label)
                  self._handle_provider_error(name, client, e, cb, request_id)
                  span.record_exception(e)
                  span.set_status(Status(StatusCode.ERROR))
@@ -503,10 +532,15 @@
                  latency = (time.monotonic() - start) * 1000
                  bypass_cb.record_success()
                  self.provider_states[bypass_name].record_success(latency)
+                 model_name = _get_model_name(bypass_client)
+                 record_llm_success(bypass_name, model_name, latency)
                  self._log_routing_decision(request_id, bypass_name, "success_emergency_bypass", latency)
                  return result
              except Exception as e:
                  last_error = e
+                 model_name = _get_model_name(bypass_client)
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(bypass_name, model_name, error_label)
                  self._handle_provider_error(bypass_name, bypass_client, e, bypass_cb, request_id)

          # ATTEMPT 2: Adaptive Non-blocking fast retry
@@ -549,12 +583,16 @@

                  cb.record_success()
                  self.provider_states[name].record_success(latency)
-
+                 model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                 record_llm_success(name, model_name, latency)
                  self._log_routing_decision(request_id, name, "success_after_wait", latency)
                  span.set_attribute("llm.latency_ms", latency)
                  return result
              except Exception as e:
                  last_error = e
+                 model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(name, model_name, error_label)
                  self._handle_provider_error(name, client, e, cb, request_id)
                  span.record_exception(e)
                  span.set_status(Status(StatusCode.ERROR))
@@ -664,6 +702,8 @@
              except Exception as gen_err:
                  logger.exception(f"[ATTEMPT 1] ❌ FAILED to create generator for {name}: {type(gen_err).__name__}: {gen_err}")
                  last_error = gen_err
+                 model_name = _get_model_name(client)
+                 record_llm_error(name, model_name, type(gen_err).__name__)
                  raise

              first = True
@@ -674,8 +714,9 @@
                      logger.info(f"[ATTEMPT 1] ✓✓✓ SUCCESS! First chunk (TTFT) from {name} in {latency:.1f}ms")
                      cb.record_success()
                      self.provider_states[name].record_success(latency, ttft_ms=latency)
-                     model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                     model_name = _get_model_name(client)
                      telemetry.record_success(name, model_name, latency)
+                     record_llm_success(name, model_name, latency, ttft_ms=latency)

                      self._log_routing_decision(request_id, name, "stream_started", latency)
                      span.set_attribute("llm.latency_ms", latency)
@@ -688,8 +729,10 @@
              except Exception as e:
                  last_error = e
                  logger.exception(f"[ATTEMPT 1] ❌ FAILED {name}: {type(e).__name__}: {e}")
-                 model_name = client._get_active_models()[0] if hasattr(client, "_get_active_models") and client._get_active_models() else "unknown"
+                 model_name = _get_model_name(client)
                  telemetry.record_error(name, model_name)
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(name, model_name, error_label)
                  self._handle_provider_error(name, client, e, cb, request_id)
                  span.record_exception(e)
                  span.set_status(Status(StatusCode.ERROR))
@@ -726,12 +769,17 @@
                          logger.info(f"[EMERGENCY BYPASS] ✓ First chunk (TTFT) from {bypass_name} in {latency:.1f}ms")
                          bypass_cb.record_success()
                          self.provider_states[bypass_name].record_success(latency, ttft_ms=latency)
+                         model_name = _get_model_name(bypass_client)
+                         record_llm_success(bypass_name, model_name, latency, ttft_ms=latency)
                      yield chunk
                  logger.info(f"[EMERGENCY BYPASS] ✓✓✓ Stream completed via {bypass_name}")
                  return
              except Exception as e:
                  last_error = e
                  logger.exception(f"[EMERGENCY BYPASS] ❌ {bypass_name} also failed: {e}")
+                 model_name = _get_model_name(bypass_client)
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(bypass_name, model_name, error_label)
                  self._handle_provider_error(bypass_name, bypass_client, e, bypass_cb, request_id)

          # ATTEMPT 2: Adaptive Non-blocking fast retry
@@ -791,6 +839,8 @@
              except Exception as gen_err:
                  logger.exception(f"[ATTEMPT 2] ❌ Generator failed for {name}: {gen_err}")
                  last_error = gen_err
+                 model_name = _get_model_name(client)
+                 record_llm_error(name, model_name, type(gen_err).__name__)
                  raise

              first = True
@@ -801,6 +851,8 @@
                      logger.info(f"[ATTEMPT 2] ✓✓✓ SUCCESS! First chunk (TTFT) from {name} in {latency:.1f}ms")
                      cb.record_success()
                      self.provider_states[name].record_success(latency, ttft_ms=latency)
+                     model_name = _get_model_name(client)
+                     record_llm_success(name, model_name, latency, ttft_ms=latency)
                      self._log_routing_decision(request_id, name, "stream_started_after_wait", latency)
                      span.set_attribute("llm.latency_ms", latency)
                      span.set_attribute("llm.ttft_ms", latency)
@@ -811,6 +863,9 @@
              except Exception as e:
                  last_error = e
                  logger.exception(f"[ATTEMPT 2] ❌ FAILED {name}: {type(e).__name__}: {e}")
+                 model_name = _get_model_name(client)
+                 error_label = e.error_type.value if isinstance(e, LLMProviderError) else type(e).__name__
+                 record_llm_error(name, model_name, error_label)
                  self._handle_provider_error(name, client, e, cb, request_id)
                  span.record_exception(e)
                  span.set_status(Status(StatusCode.ERROR))
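The `error_label` expression repeated in each except block maps an exception to a low-cardinality metric label. The pattern, compacted into a helper for clarity (a sketch only; `LLMProviderError` is assumed to be the provider exception type already defined in this module):

    def _error_label(exc: Exception) -> str:
        # Prefer the provider error taxonomy when available, otherwise fall
        # back to the exception class name to keep label cardinality low.
        if isinstance(exc, LLMProviderError):  # assumed defined elsewhere in llm_router.py
            return exc.error_type.value
        return type(exc).__name__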
main.py CHANGED
@@ -7,12 +7,15 @@ import time
  from fastapi import FastAPI, Request
  from fastapi.responses import RedirectResponse, JSONResponse
  from fastapi.middleware.gzip import GZipMiddleware
+ from prometheus_fastapi_instrumentator import Instrumentator
+ from prometheus_client import REGISTRY
  from contextlib import asynccontextmanager
  from fastapi.middleware.cors import CORSMiddleware
  from app.core.config import settings
  from app.api.api import api_router
  from app.core.redis_client import redis_client
  from app.core.otel import init_otel
+ from app.core.observability import metrics_router  # exposes /metrics with all custom + instrumentator metrics

  # ─── Logging ──────────────────────────────────────────────────────────────────

@@ -122,6 +125,7 @@ def create_application() -> FastAPI:
      from app.api.api import public_router
      application.include_router(api_router, prefix="/api")
      application.include_router(public_router, prefix="/api")
+     application.include_router(metrics_router)  # /metrics for Prometheus/Grafana

      @application.get("/", include_in_schema=False)
      async def root():
@@ -169,6 +173,16 @@
          )
          return response

+     # Prometheus HTTP instrumentation (path-level latency, throughput, errors).
+     # We call instrument() to register the collection middleware but do NOT call
+     # expose() — our custom metrics_router at /metrics already calls
+     # generate_latest() which reads from the default registry, so instrumentator
+     # metrics (http_request_duration_seconds, etc.) are included automatically.
+     Instrumentator(
+         should_group_status_codes=True,
+         excluded_handlers={"/metrics", "/health", "/"},
+     ).instrument(application)
+
      return application

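A quick way to confirm the wiring end to end (illustrative; host, port, and the use of httpx are assumptions for a local run):

    import httpx

    # With the app running locally (e.g. uvicorn main:app --port 8000), the
    # scrape endpoint should expose both the custom llm_*/redis_* series and
    # the instrumentator's http_* series from the default registry.
    resp = httpx.get("http://localhost:8000/metrics")
    assert resp.status_code == 200
    assert "llm_calls_total" in resp.text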
monitoring/alert_rules.yml CHANGED
@@ -55,3 +55,15 @@ groups:
        annotations:
          summary: "p95 latency above SLO"
          description: "p95 latency >1.5s for provider {{ $labels.provider }} for 10m."
+
+      - alert: AllProvidersDown
+        expr: count(llm_breaker_open_state == 1) >= count(llm_breaker_open_state)
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "ALL LLM providers are down"
+          description: >
+            Every registered LLM provider circuit breaker is OPEN.
+            The service is fully degraded and returning cached/fallback responses only.
+            Investigate upstream API failures immediately.
monitoring/grafana_dashboard.json CHANGED
@@ -1,108 +1,427 @@
1
  {
2
- "title": "Awn AI Routing — Service Overview",
3
- "tags": ["awn", "llm", "routing"],
4
  "timezone": "browser",
5
  "schemaVersion": 38,
6
  "panels": [
7
  {
8
  "type": "row",
9
- "title": "Latency",
10
- "panels": [
11
- {
12
- "type": "timeseries",
13
- "title": "LLM Latency p95",
14
- "targets": [
15
- { "expr": "histogram_quantile(0.95, sum by (le,provider,model) (rate(llm_latency_ms_bucket[5m])))", "legendFormat": "{{provider}}/{{model}}" }
16
- ]
17
- },
18
  {
19
- "type": "timeseries",
20
- "title": "LLM Latency p99",
21
- "targets": [
22
- { "expr": "histogram_quantile(0.99, sum by (le,provider,model) (rate(llm_latency_ms_bucket[5m])))", "legendFormat": "{{provider}}/{{model}}" }
23
- ]
24
  }
25
  ]
26
  },
27
  {
28
  "type": "row",
29
- "title": "Reliability",
30
- "panels": [
31
- {
32
- "type": "timeseries",
33
- "title": "Error Rate",
34
- "targets": [
35
- { "expr": "sum by (provider,model) (rate(llm_errors_total[5m])) / sum by (provider,model) (rate(llm_calls_total[5m]))", "legendFormat": "{{provider}}/{{model}}" }
36
- ]
37
- },
38
  {
39
- "type": "timeseries",
40
- "title": "Breaker State (OPEN=1)",
41
- "targets": [
42
- { "expr": "llm_breaker_open_state", "legendFormat": "{{provider}}" }
43
- ]
44
  }
45
  ]
46
  },
47
  {
48
  "type": "row",
49
- "title": "Capacity",
50
- "panels": [
51
- {
52
- "type": "timeseries",
53
- "title": "Bulkhead Utilization",
54
- "targets": [
55
- { "expr": "llm_bulkhead_in_use / llm_bulkhead_capacity", "legendFormat": "{{provider}}" }
56
- ]
57
- },
58
  {
59
- "type": "timeseries",
60
- "title": "Bulkhead Skips",
61
- "targets": [
62
- { "expr": "rate(llm_bulkhead_skips_total[5m])", "legendFormat": "{{provider}}" }
63
- ]
64
  }
65
  ]
66
  },
 
67
  {
 
68
  "type": "row",
69
- "title": "Redis Health",
70
- "panels": [
71
- {
72
- "type": "timeseries",
73
- "title": "Redis RTT p95",
74
- "targets": [
75
- { "expr": "histogram_quantile(0.95, sum by (le) (rate(redis_rtt_ms_bucket[5m])))", "legendFormat": "rtt p95" }
76
- ]
77
  },
78
  {
79
- "type": "timeseries",
80
- "title": "Redis Errors",
81
- "targets": [
82
- { "expr": "rate(redis_errors_total[5m])", "legendFormat": "errors" }
83
- ]
84
  }
85
  ]
86
- }
87
- ],
88
- "templating": {
89
- "list": [
90
- {
91
- "type": "query",
92
- "name": "provider",
93
- "datasource": null,
94
- "query": "label_values(llm_calls_total, provider)",
95
- "refresh": 2
96
  },
97
- {
98
- "type": "query",
99
- "name": "model",
100
- "datasource": null,
101
- "query": "label_values(llm_calls_total, model)",
102
- "refresh": 2
103
- }
104
- ]
105
- },
106
- "time": { "from": "now-24h", "to": "now" },
107
- "refresh": "30s"
108
  }
 
1
  {
2
+ "id": null,
3
+ "uid": "awn-ai-ops-v2",
4
+ "title": "Awn AI Service — Operational Dashboard",
5
+ "tags": ["awn", "llm", "observability"],
6
  "timezone": "browser",
7
  "schemaVersion": 38,
8
+ "refresh": "30s",
9
+ "time": { "from": "now-24h", "to": "now" },
10
+ "templating": {
11
+ "list": [
12
+ {
13
+ "type": "datasource",
14
+ "name": "datasource",
15
+ "label": "Prometheus",
16
+ "pluginId": "prometheus",
17
+ "refresh": 1
18
+ },
19
+ {
20
+ "type": "query",
21
+ "name": "provider",
22
+ "label": "Provider",
23
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
24
+ "query": "label_values(llm_calls_total, provider)",
25
+ "refresh": 2,
26
+ "includeAll": true,
27
+ "multi": true
28
+ },
29
+ {
30
+ "type": "query",
31
+ "name": "model",
32
+ "label": "Model",
33
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
34
+ "query": "label_values(llm_calls_total, model)",
35
+ "refresh": 2,
36
+ "includeAll": true,
37
+ "multi": true
38
+ }
39
+ ]
40
+ },
41
  "panels": [
42
  {
43
+ "id": 1,
44
  "type": "row",
45
+ "title": "🌐 Traffic & HTTP",
46
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
47
+ "collapsed": false
48
+ },
49
+ {
50
+ "id": 2,
51
+ "type": "timeseries",
52
+ "title": "Requests per second",
53
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 },
54
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
55
+ "fieldConfig": {
56
+ "defaults": { "unit": "reqps", "color": { "mode": "palette-classic" } }
57
+ },
58
+ "targets": [
59
  {
60
+ "expr": "sum(rate(http_requests_total{handler!=\"/metrics\"}[5m])) by (method)",
61
+ "legendFormat": "{{method}}",
62
+ "refId": "A"
63
  }
64
  ]
65
  },
66
  {
67
+ "id": 3,
68
+ "type": "timeseries",
69
+ "title": "HTTP 5xx error ratio",
70
+ "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 },
71
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
72
+ "fieldConfig": {
73
+ "defaults": {
74
+ "unit": "percentunit",
75
+ "color": { "mode": "fixed", "fixedColor": "red" },
76
+ "thresholds": {
77
+ "mode": "absolute",
78
+ "steps": [
79
+ { "color": "green", "value": null },
80
+ { "color": "yellow", "value": 0.01 },
81
+ { "color": "red", "value": 0.05 }
82
+ ]
83
+ }
84
+ }
85
+ },
86
+ "targets": [
87
+ {
88
+ "expr": "sum(rate(http_requests_total{status=~\"5..\",handler!=\"/metrics\"}[5m])) / sum(rate(http_requests_total{handler!=\"/metrics\"}[5m]))",
89
+ "legendFormat": "5xx ratio",
90
+ "refId": "A"
91
+ }
92
+ ]
93
+ },
94
+ {
95
+ "id": 4,
96
+ "type": "timeseries",
97
+ "title": "HTTP latency p95 (by route)",
98
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 },
99
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
100
+ "fieldConfig": {
101
+ "defaults": { "unit": "s" }
102
+ },
103
+ "targets": [
104
+ {
105
+ "expr": "histogram_quantile(0.95, sum by (le,handler) (rate(http_request_duration_seconds_bucket{handler!=\"/metrics\"}[5m])))",
106
+ "legendFormat": "{{handler}}",
107
+ "refId": "A"
108
+ }
109
+ ]
110
+ },
111
+
112
+ {
113
+ "id": 10,
114
  "type": "row",
115
+ "title": "🤖 LLM Performance",
116
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 },
117
+ "collapsed": false
118
+ },
119
+ {
120
+ "id": 11,
121
+ "type": "timeseries",
122
+ "title": "LLM end-to-end latency p95",
123
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 },
124
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
125
+ "fieldConfig": {
126
+ "defaults": {
127
+ "unit": "ms",
128
+ "thresholds": {
129
+ "mode": "absolute",
130
+ "steps": [
131
+ { "color": "green", "value": null },
132
+ { "color": "yellow", "value": 800 },
133
+ { "color": "red", "value": 1500 }
134
+ ]
135
+ }
136
+ }
137
+ },
138
+ "targets": [
139
+ {
140
+ "expr": "histogram_quantile(0.95, sum by (le,provider,model) (rate(llm_latency_ms_bucket[5m])))",
141
+ "legendFormat": "{{provider}}/{{model}}",
142
+ "refId": "A"
143
+ }
144
+ ]
145
+ },
146
+ {
147
+ "id": 12,
148
+ "type": "timeseries",
149
+ "title": "Time-to-First-Token p95 (TTFT)",
150
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 },
151
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
152
+ "fieldConfig": {
153
+ "defaults": {
154
+ "unit": "ms",
155
+ "thresholds": {
156
+ "mode": "absolute",
157
+ "steps": [
158
+ { "color": "green", "value": null },
159
+ { "color": "yellow", "value": 500 },
160
+ { "color": "red", "value": 1200 }
161
+ ]
162
+ }
163
+ }
164
+ },
165
+ "targets": [
166
  {
167
+ "expr": "histogram_quantile(0.95, sum by (le,provider,model) (rate(llm_ttft_ms_bucket[5m])))",
168
+ "legendFormat": "{{provider}}/{{model}}",
169
+ "refId": "A"
170
  }
171
  ]
172
  },
173
  {
174
+ "id": 13,
175
+ "type": "timeseries",
176
+ "title": "LLM error rate by provider",
177
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 },
178
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
179
+ "fieldConfig": {
180
+ "defaults": {
181
+ "unit": "percentunit",
182
+ "thresholds": {
183
+ "mode": "absolute",
184
+ "steps": [
185
+ { "color": "green", "value": null },
186
+ { "color": "yellow", "value": 0.02 },
187
+ { "color": "red", "value": 0.1 }
188
+ ]
189
+ }
190
+ }
191
+ },
192
+ "targets": [
193
+ {
194
+ "expr": "sum by (provider) (rate(llm_errors_total[5m])) / sum by (provider) (rate(llm_calls_total[5m]))",
195
+ "legendFormat": "{{provider}}",
196
+ "refId": "A"
197
+ }
198
+ ]
199
+ },
200
+ {
201
+ "id": 14,
202
+ "type": "timeseries",
203
+ "title": "LLM call volume",
204
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 },
205
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
206
+ "fieldConfig": {
207
+ "defaults": { "unit": "reqps" }
208
+ },
209
+ "targets": [
210
+ {
211
+ "expr": "sum(rate(llm_calls_total[5m])) by (provider,model)",
212
+ "legendFormat": "{{provider}}/{{model}}",
213
+ "refId": "A"
214
+ }
215
+ ]
216
+ },
217
+
218
+ {
219
+ "id": 20,
220
  "type": "row",
221
+ "title": "🛡 Resilience — Circuit Breakers & Bulkhead",
222
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
223
+ "collapsed": false
224
+ },
225
+ {
226
+ "id": 21,
227
+ "type": "timeseries",
228
+ "title": "Circuit breaker state (0=closed, 0.5=half-open, 1=open)",
229
+ "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 },
230
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
231
+ "fieldConfig": {
232
+ "defaults": {
233
+ "min": 0,
234
+ "max": 1,
235
+ "thresholds": {
236
+ "mode": "absolute",
237
+ "steps": [
238
+ { "color": "green", "value": null },
239
+ { "color": "yellow", "value": 0.4 },
240
+ { "color": "red", "value": 0.9 }
241
+ ]
242
+ }
243
+ }
244
+ },
245
+ "targets": [
246
+ {
247
+ "expr": "llm_breaker_open_state",
248
+ "legendFormat": "{{provider}}",
249
+ "refId": "A"
250
+ }
251
+ ]
252
+ },
253
+ {
254
+ "id": 22,
255
+ "type": "timeseries",
256
+ "title": "Bulkhead utilization (in_use / capacity)",
257
+ "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 },
258
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
259
+ "fieldConfig": {
260
+ "defaults": {
261
+ "unit": "percentunit",
262
+ "min": 0,
263
+ "max": 1,
264
+ "thresholds": {
265
+ "mode": "absolute",
266
+ "steps": [
267
+ { "color": "green", "value": null },
268
+ { "color": "yellow", "value": 0.7 },
269
+ { "color": "red", "value": 0.9 }
270
+ ]
271
+ }
272
+ }
273
+ },
274
+ "targets": [
275
+ {
276
+ "expr": "llm_bulkhead_in_use / llm_bulkhead_capacity",
277
+ "legendFormat": "{{provider}}",
278
+ "refId": "A"
279
+ }
280
+ ]
281
+ },
282
+ {
283
+ "id": 23,
284
+ "type": "timeseries",
285
+ "title": "Bulkhead skips/s (shed requests)",
286
+ "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 },
287
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
288
+ "fieldConfig": {
289
+ "defaults": { "unit": "reqps", "color": { "mode": "fixed", "fixedColor": "orange" } }
290
+ },
291
+ "targets": [
292
  {
293
+ "expr": "rate(llm_bulkhead_skips_total[5m])",
294
+ "legendFormat": "{{provider}}",
295
+ "refId": "A"
296
  }
297
  ]
298
  },
299
+
300
  {
301
+ "id": 30,
302
  "type": "row",
303
+ "title": "🔴 Redis Health",
304
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 },
305
+ "collapsed": false
306
+ },
307
+ {
308
+ "id": 31,
309
+ "type": "timeseries",
310
+ "title": "Redis RTT p95",
311
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 },
312
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
313
+ "fieldConfig": {
314
+ "defaults": {
315
+ "unit": "ms",
316
+ "thresholds": {
317
+ "mode": "absolute",
318
+ "steps": [
319
+ { "color": "green", "value": null },
320
+ { "color": "yellow", "value": 20 },
321
+ { "color": "red", "value": 50 }
322
+ ]
323
+ }
324
+ }
325
+ },
326
+ "targets": [
327
+ {
328
+ "expr": "histogram_quantile(0.95, sum by (le,endpoint) (rate(redis_rtt_ms_bucket[5m])))",
329
+ "legendFormat": "{{endpoint}} p95",
330
+ "refId": "A"
331
+ }
332
+ ]
333
+ },
334
+ {
335
+ "id": 32,
336
+ "type": "timeseries",
337
+ "title": "Redis errors/s",
338
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 },
339
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
340
+ "fieldConfig": {
341
+ "defaults": { "unit": "reqps", "color": { "mode": "fixed", "fixedColor": "red" } }
342
+ },
343
+ "targets": [
344
+ {
345
+ "expr": "rate(redis_errors_total[5m])",
346
+ "legendFormat": "{{endpoint}} / {{operation}}",
347
+ "refId": "A"
348
+ }
349
+ ]
350
+ },
351
+
352
+ {
353
+ "id": 40,
354
+ "type": "row",
355
+ "title": "📊 SLO Watch",
356
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 44 },
357
+ "collapsed": false
358
+ },
359
+ {
360
+ "id": 41,
361
+ "type": "timeseries",
362
+ "title": "p95 latency vs SLO (threshold = 1500ms)",
363
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 45 },
364
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
365
+ "fieldConfig": {
366
+ "defaults": {
367
+ "unit": "ms",
368
+ "thresholds": {
369
+ "mode": "absolute",
370
+ "steps": [
371
+ { "color": "green", "value": null },
372
+ { "color": "yellow", "value": 800 },
373
+ { "color": "red", "value": 1500 }
374
+ ]
375
+ },
376
+ "custom": {
377
+ "thresholdsStyle": { "mode": "line+area" }
378
+ }
379
+ }
380
+ },
381
+ "targets": [
382
+ {
383
+ "expr": "histogram_quantile(0.95, sum by (le,provider) (rate(llm_latency_ms_bucket[5m])))",
384
+ "legendFormat": "{{provider}} p95",
385
+ "refId": "A"
386
  },
387
  {
388
+ "expr": "vector(1500)",
389
+ "legendFormat": "SLO limit (1500ms)",
390
+ "refId": "B"
391
  }
392
  ]
393
+ },
394
+ {
395
+ "id": 42,
396
+ "type": "timeseries",
397
+ "title": "Error budget burn rate (5m window)",
398
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 45 },
399
+ "datasource": { "type": "prometheus", "uid": "${datasource}" },
400
+ "fieldConfig": {
401
+ "defaults": {
402
+ "unit": "percentunit",
403
+ "thresholds": {
404
+ "mode": "absolute",
405
+ "steps": [
406
+ { "color": "green", "value": null },
407
+ { "color": "yellow", "value": 0.02 },
408
+ { "color": "red", "value": 0.05 }
409
+ ]
410
+ }
411
+ }
412
  },
413
+ "targets": [
414
+ {
415
+ "expr": "sum by (provider) (rate(llm_errors_total[5m])) / sum by (provider) (rate(llm_calls_total[5m]))",
416
+ "legendFormat": "{{provider}} error rate",
417
+ "refId": "A"
418
+ },
419
+ {
420
+ "expr": "vector(0.02)",
421
+ "legendFormat": "Error budget (2%)",
422
+ "refId": "B"
423
+ }
424
+ ]
425
+ }
426
+ ]
427
  }
requirements.txt CHANGED
@@ -3,6 +3,8 @@ uvicorn[standard]>=0.34.0,<1.0.0
  pydantic-settings>=2.0.0,<3.0.0
  python-multipart>=0.0.9
  python-dotenv>=1.0.0,<2.0.0
+ prometheus-client>=0.20.0,<1.0.0
+ prometheus-fastapi-instrumentator>=7.0.0,<8.0.0
  joblib>=1.4.0,<2.0.0
  scikit-learn==1.6.1
  xgboost>=2.1.0,<3.0.0
  xgboost>=2.1.0,<3.0.0