# Repository-viewer header captured with this file (not part of the rules):
# Melika Kheirieh
# feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage
# f89e294
# raw
# history blame
# 2.76 kB
groups:
# 1) Recording rules (all derived metric calculations)
- name: nl2sql_derived
  # Recording rules, evaluated every 15s. Each rule precomputes one derived
  # series; the alert rules in this file read some of them by name.
  interval: 15s
  rules:
  # p95 latency per stage, taken from the stage_duration_ms histogram over a
  # 5m rate window.
  # NOTE(review): histogram_quantile returns a value in the same unit as the
  # `le` bucket bounds. If stage_duration_ms is observed in milliseconds (as
  # its name suggests), the `* 1000` below inflates the result 1000x and
  # should be dropped -- confirm against the instrumentation before changing.
  - record: nl2sql:stage_p95_ms
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
      ) * 1000

  # Pipeline success ratio: ok runs / all runs over 5m.
  # rate() is per second, so the denominator can legitimately be far below 1.
  # It is clamped to a tiny epsilon purely to avoid 0/0 = NaN when idle
  # (the ratio is then 0); clamping to 1 would turn the ratio into the raw
  # success rate whenever traffic is under 1 run/s.
  - record: nl2sql:pipeline_success_ratio
    expr: |
      sum(rate(pipeline_runs_total{status="ok"}[5m]))
      /
      clamp_min(sum(rate(pipeline_runs_total[5m])), 1e-9)

  # Repair success rate: successful repairs / repair attempts over 5m.
  # Same epsilon clamp as above: guards against 0/0 without distorting the
  # ratio at low attempt rates.
  - record: nl2sql:repair_success_rate
    expr: |
      sum(rate(repair_attempts_total{outcome="success"}[5m]))
      /
      clamp_min(sum(rate(repair_attempts_total{outcome="attempt"}[5m])), 1e-9)

  # Cache hit ratio: hits / all cache events over 5m (epsilon clamp as above).
  - record: nl2sql:cache_hit_ratio
    expr: |
      sum(rate(cache_events_total{hit="true"}[5m]))
      /
      clamp_min(sum(rate(cache_events_total[5m])), 1e-9)

  # Verifier events per minute, split by the `ok` label.
  # rate() yields events per second; `* 60` converts to the per-minute unit
  # the series name promises.
  - record: nl2sql:verifier_events_per_min
    expr: |
      sum by (ok) (rate(verifier_checks_total[1m])) * 60

  # Safety blocks per minute (per-second rate * 60).
  - record: nl2sql:safety_blocks_per_min
    expr: |
      sum(rate(safety_blocks_total[1m])) * 60

  # Combined safety blocks + verifier failures per minute
  # (sum of the two per-second rates, then * 60).
  - record: nl2sql:safety_verifier_events_per_min
    expr: |
      (
        sum(rate(safety_blocks_total[1m]))
        +
        sum(rate(verifier_failures_total[1m]))
      ) * 60
# 2) Alerts (built on the recording rules above; Prometheus evaluates rule groups independently, so this ordering is for readability only)
- name: nl2sql_alerts
  # Alerting rules. No `interval` is set here, so the group uses the
  # server's global evaluation interval.
  rules:
  # Success ratio < 90% for 10 minutes.
  # The `and on() (...)` guard restricts the alert to periods with actual
  # pipeline traffic: with no runs the recorded ratio is 0, which would
  # otherwise page on an idle system even though nothing failed.
  - alert: PipelineLowSuccessRatio
    expr: |
      nl2sql:pipeline_success_ratio < 0.9
      and on()
      (sum(rate(pipeline_runs_total[5m])) > 0)
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Pipeline success ratio dropped"
      description: "Success ratio < 90% over the past 10 minutes"

  # Generator p95 latency > 1.5s for 5 minutes.
  # NOTE(review): the 1500 threshold assumes nl2sql:stage_p95_ms really is in
  # milliseconds -- verify the unit conversion in its recording rule.
  - alert: GeneratorLatencyHigh
    expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Generator p95 latency high"
      description: "Generator p95 > 1.5s for 5 minutes"

  # Safety blocks spike — per minute (not per second): rate() is per second,
  # so it is multiplied by 60 before comparing against 0.5/min.
  # The expression is not aggregated, so it is evaluated per
  # safety_blocks_total series.
  - alert: SafetyBlocksSpike
    expr: rate(safety_blocks_total[5m]) * 60 > 0.5
    for: 5m
    labels:
      severity: info
    annotations:
      summary: "Unusual Safety block rate"
      description: "Safety blocks > 0.5 per minute (5m window)"