Melika Kheirieh committed on
Commit
666306b
·
1 Parent(s): cf4af3c

feat(metrics): enable repair tracking and Prometheus lifecycle for observability

Browse files
app/routers/nl2sql.py CHANGED
@@ -108,7 +108,7 @@ def _cache_gc(now: float) -> None:
108
  for k, (ts, _) in list(_CACHE.items()):
109
  if now - ts > _CACHE_TTL:
110
  _CACHE.pop(k, None)
111
- # size eviction (ساده)
112
  while len(_CACHE) > _CACHE_MAX:
113
  _CACHE.pop(next(iter(_CACHE)), None)
114
 
 
108
  for k, (ts, _) in list(_CACHE.items()):
109
  if now - ts > _CACHE_TTL:
110
  _CACHE.pop(k, None)
111
+ # size eviction
112
  while len(_CACHE) > _CACHE_MAX:
113
  _CACHE.pop(next(iter(_CACHE)), None)
114
 
docker-compose.yml CHANGED
@@ -1,40 +1,31 @@
1
- version: "3.9"
2
-
3
  services:
4
- postgres:
5
- image: postgres:16
6
- container_name: nl2sql_pg
7
- environment:
8
- POSTGRES_USER: postgres
9
- POSTGRES_PASSWORD: postgres
10
- POSTGRES_DB: demo
11
- volumes:
12
- - pgdata:/var/lib/postgresql/data
13
- - ./infra/migrate.sql:/docker-entrypoint-initdb.d/00_init.sql:ro
14
- ports:
15
- - "5432:5432"
16
- healthcheck:
17
- test: ["CMD-SHELL", "pg_isready -U postgres -d demo"]
18
- interval: 5s
19
- timeout: 3s
20
- retries: 10
21
-
22
  api:
23
- build:
24
- context: .
25
- dockerfile: Dockerfile
26
- container_name: nl2sql_api
27
- depends_on:
28
- postgres:
29
- condition: service_healthy
30
  environment:
31
- DB_MODE: postgres
32
  POSTGRES_DSN: dbname=demo user=postgres password=postgres host=postgres port=5432
33
- OPENAI_MODEL_ID: gpt-4o-mini
34
  OPENAI_API_KEY: ${OPENAI_API_KEY}
35
  ports:
36
  - "8000:8000"
37
  command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  volumes:
40
- pgdata:
 
 
 
1
  services:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  api:
 
 
 
 
 
 
 
3
  environment:
 
4
  POSTGRES_DSN: dbname=demo user=postgres password=postgres host=postgres port=5432
5
+ OPENAI_MODEL: gpt-4o-mini
6
  OPENAI_API_KEY: ${OPENAI_API_KEY}
7
  ports:
8
  - "8000:8000"
9
  command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
10
 
11
+ prometheus:
12
+ image: prom/prometheus:latest
13
+ container_name: prometheus
14
+ restart: unless-stopped
15
+ ports:
16
+ - "9090:9090"
17
+ volumes:
18
+ # 📘 Prometheus config
19
+ - ./provisioning/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
20
+ # 📘 Rules (recording + alert rules)
21
+ - ./provisioning/prometheus/rules.yml:/etc/prometheus/rules.yml:ro
22
+ # 📘 Persistent TSDB storage
23
+ - prometheus_data:/prometheus
24
+ command:
25
+ - "--config.file=/etc/prometheus/prometheus.yml"
26
+ - "--storage.tsdb.path=/prometheus"
27
+ - "--web.enable-lifecycle"
28
+ - "--web.enable-admin-api"
29
+
30
  volumes:
31
+ prometheus_data:
nl2sql/metrics.py CHANGED
@@ -55,7 +55,7 @@ verifier_failures_total = Counter(
55
  repair_attempts_total = Counter(
56
  "repair_attempts_total",
57
  "Number of repair loop attempts",
58
- ["outcome"], # success | failed
59
  registry=REGISTRY,
60
  )
61
 
 
55
  repair_attempts_total = Counter(
56
  "repair_attempts_total",
57
  "Number of repair loop attempts",
58
+ ["outcome"], # attempt | success | failed
59
  registry=REGISTRY,
60
  )
61
 
nl2sql/pipeline.py CHANGED
@@ -305,6 +305,7 @@ class Pipeline:
305
  if not verified:
306
  for _attempt in range(2):
307
  # repair
 
308
  t0 = time.perf_counter()
309
  r_fix = self._safe_stage(
310
  self.repair.run,
 
305
  if not verified:
306
  for _attempt in range(2):
307
  # repair
308
+ repair_attempts_total.labels(outcome="attempt").inc()
309
  t0 = time.perf_counter()
310
  r_fix = self._safe_stage(
311
  self.repair.run,
prometheus/rules.yml CHANGED
@@ -23,11 +23,8 @@ groups:
23
  # repair success rate (0..1)
24
  - record: nl2sql:repair_success_rate
25
  expr: |
26
- (
27
- sum(rate(repair_attempts_total{outcome="success"}[5m]))
28
- )
29
- /
30
- clamp_min(sum(rate(repair_attempts_total[5m])), 1)
31
 
32
  # cache hit ratio (0..1)
33
  - record: nl2sql:cache_hit_ratio
 
23
  # repair success rate (0..1)
24
  - record: nl2sql:repair_success_rate
25
  expr: |
26
+ (sum(increase(repair_attempts_total{outcome="success"}[30m]))) /
27
+ clamp_min(sum(increase(repair_attempts_total[30m])), 1)
 
 
 
28
 
29
  # cache hit ratio (0..1)
30
  - record: nl2sql:cache_hit_ratio