Spaces:
Running
Running
Melika Kheirieh
committed on
Commit
·
666306b
1
Parent(s):
cf4af3c
feat(metrics): enable repair tracking and Prometheus lifecycle for observability
Browse files
- app/routers/nl2sql.py +1 -1
- docker-compose.yml +21 -30
- nl2sql/metrics.py +1 -1
- nl2sql/pipeline.py +1 -0
- prometheus/rules.yml +2 -5
app/routers/nl2sql.py
CHANGED
|
@@ -108,7 +108,7 @@ def _cache_gc(now: float) -> None:
|
|
| 108 |
for k, (ts, _) in list(_CACHE.items()):
|
| 109 |
if now - ts > _CACHE_TTL:
|
| 110 |
_CACHE.pop(k, None)
|
| 111 |
-
# size eviction
|
| 112 |
while len(_CACHE) > _CACHE_MAX:
|
| 113 |
_CACHE.pop(next(iter(_CACHE)), None)
|
| 114 |
|
|
|
|
| 108 |
for k, (ts, _) in list(_CACHE.items()):
|
| 109 |
if now - ts > _CACHE_TTL:
|
| 110 |
_CACHE.pop(k, None)
|
| 111 |
+
# size eviction
|
| 112 |
while len(_CACHE) > _CACHE_MAX:
|
| 113 |
_CACHE.pop(next(iter(_CACHE)), None)
|
| 114 |
|
docker-compose.yml
CHANGED
|
@@ -1,40 +1,31 @@
|
|
| 1 |
-
version: "3.9"
|
| 2 |
-
|
| 3 |
services:
|
| 4 |
-
postgres:
|
| 5 |
-
image: postgres:16
|
| 6 |
-
container_name: nl2sql_pg
|
| 7 |
-
environment:
|
| 8 |
-
POSTGRES_USER: postgres
|
| 9 |
-
POSTGRES_PASSWORD: postgres
|
| 10 |
-
POSTGRES_DB: demo
|
| 11 |
-
volumes:
|
| 12 |
-
- pgdata:/var/lib/postgresql/data
|
| 13 |
-
- ./infra/migrate.sql:/docker-entrypoint-initdb.d/00_init.sql:ro
|
| 14 |
-
ports:
|
| 15 |
-
- "5432:5432"
|
| 16 |
-
healthcheck:
|
| 17 |
-
test: ["CMD-SHELL", "pg_isready -U postgres -d demo"]
|
| 18 |
-
interval: 5s
|
| 19 |
-
timeout: 3s
|
| 20 |
-
retries: 10
|
| 21 |
-
|
| 22 |
api:
|
| 23 |
-
build:
|
| 24 |
-
context: .
|
| 25 |
-
dockerfile: Dockerfile
|
| 26 |
-
container_name: nl2sql_api
|
| 27 |
-
depends_on:
|
| 28 |
-
postgres:
|
| 29 |
-
condition: service_healthy
|
| 30 |
environment:
|
| 31 |
-
DB_MODE: postgres
|
| 32 |
POSTGRES_DSN: dbname=demo user=postgres password=postgres host=postgres port=5432
|
| 33 |
-
|
| 34 |
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
| 35 |
ports:
|
| 36 |
- "8000:8000"
|
| 37 |
command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
volumes:
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 1 |
services:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
api:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
environment:
|
|
|
|
| 4 |
POSTGRES_DSN: dbname=demo user=postgres password=postgres host=postgres port=5432
|
| 5 |
+
OPENAI_MODEL: gpt-4o-mini
|
| 6 |
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
| 7 |
ports:
|
| 8 |
- "8000:8000"
|
| 9 |
command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
|
| 10 |
|
| 11 |
+
prometheus:
|
| 12 |
+
image: prom/prometheus:latest
|
| 13 |
+
container_name: prometheus
|
| 14 |
+
restart: unless-stopped
|
| 15 |
+
ports:
|
| 16 |
+
- "9090:9090"
|
| 17 |
+
volumes:
|
| 18 |
+
# 📘 Prometheus config
|
| 19 |
+
- ./provisioning/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 20 |
+
# 📘 Rules (recording + alert rules)
|
| 21 |
+
- ./provisioning/prometheus/rules.yml:/etc/prometheus/rules.yml:ro
|
| 22 |
+
# 📘 Persistent TSDB storage
|
| 23 |
+
- prometheus_data:/prometheus
|
| 24 |
+
command:
|
| 25 |
+
- "--config.file=/etc/prometheus/prometheus.yml"
|
| 26 |
+
- "--storage.tsdb.path=/prometheus"
|
| 27 |
+
- "--web.enable-lifecycle"
|
| 28 |
+
- "--web.enable-admin-api"
|
| 29 |
+
|
| 30 |
volumes:
|
| 31 |
+
prometheus_data:
|
nl2sql/metrics.py
CHANGED
|
@@ -55,7 +55,7 @@ verifier_failures_total = Counter(
|
|
| 55 |
repair_attempts_total = Counter(
|
| 56 |
"repair_attempts_total",
|
| 57 |
"Number of repair loop attempts",
|
| 58 |
-
["outcome"], # success | failed
|
| 59 |
registry=REGISTRY,
|
| 60 |
)
|
| 61 |
|
|
|
|
| 55 |
repair_attempts_total = Counter(
|
| 56 |
"repair_attempts_total",
|
| 57 |
"Number of repair loop attempts",
|
| 58 |
+
["outcome"], # attempt | success | failed
|
| 59 |
registry=REGISTRY,
|
| 60 |
)
|
| 61 |
|
nl2sql/pipeline.py
CHANGED
|
@@ -305,6 +305,7 @@ class Pipeline:
|
|
| 305 |
if not verified:
|
| 306 |
for _attempt in range(2):
|
| 307 |
# repair
|
|
|
|
| 308 |
t0 = time.perf_counter()
|
| 309 |
r_fix = self._safe_stage(
|
| 310 |
self.repair.run,
|
|
|
|
| 305 |
if not verified:
|
| 306 |
for _attempt in range(2):
|
| 307 |
# repair
|
| 308 |
+
repair_attempts_total.labels(outcome="attempt").inc()
|
| 309 |
t0 = time.perf_counter()
|
| 310 |
r_fix = self._safe_stage(
|
| 311 |
self.repair.run,
|
prometheus/rules.yml
CHANGED
|
@@ -23,11 +23,8 @@ groups:
|
|
| 23 |
# repair success rate (0..1)
|
| 24 |
- record: nl2sql:repair_success_rate
|
| 25 |
expr: |
|
| 26 |
-
|
| 27 |
-
sum(
|
| 28 |
-
)
|
| 29 |
-
/
|
| 30 |
-
clamp_min(sum(rate(repair_attempts_total[5m])), 1)
|
| 31 |
|
| 32 |
# cache hit ratio (0..1)
|
| 33 |
- record: nl2sql:cache_hit_ratio
|
|
|
|
| 23 |
# repair success rate (0..1)
|
| 24 |
- record: nl2sql:repair_success_rate
|
| 25 |
expr: |
|
| 26 |
+
(sum(increase(repair_attempts_total{outcome="success"}[30m]))) /
|
| 27 |
+
clamp_min(sum(increase(repair_attempts_total[30m])), 1)
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# cache hit ratio (0..1)
|
| 30 |
- record: nl2sql:cache_hit_ratio
|