Spaces:
Sleeping
Sleeping
Melika Kheirieh
feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage
f89e294
groups:
# 1) Recording rules (all derived metric calculations).
# Each rule pre-computes a series under the `nl2sql:` prefix for dashboards and alerts.
- name: nl2sql_derived
  interval: 15s
  rules:
    # p95 latency per stage, in milliseconds.
    # NOTE(review): `stage_duration_ms_bucket` is presumably observed in
    # milliseconds (per its name), so the quantile is already in ms and is
    # not rescaled. If the histogram is actually recorded in seconds,
    # restore the `* 1000` factor — confirm against the instrumentation.
    - record: nl2sql:stage_p95_ms
      expr: |
        histogram_quantile(
          0.95,
          sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
        )

    # Pipeline success ratio (0..1) over a 5m window.
    # The denominator is floored at a tiny epsilon only to avoid 0/0 when
    # there is no traffic (result is then 0). Flooring at 1 would deflate the
    # ratio whenever total traffic is below 1 run per second.
    - record: nl2sql:pipeline_success_ratio
      expr: |
        (
          sum(rate(pipeline_runs_total{status="ok"}[5m]))
        )
        /
        clamp_min(sum(rate(pipeline_runs_total[5m])), 1e-9)

    # Repair success rate (0..1): successful repairs over repair attempts.
    # Same epsilon floor as above; yields 0 when no repairs were attempted.
    - record: nl2sql:repair_success_rate
      expr: |
        (
          sum(rate(repair_attempts_total{outcome="success"}[5m]))
        )
        /
        clamp_min(sum(rate(repair_attempts_total{outcome="attempt"}[5m])), 1e-9)

    # Cache hit ratio (0..1): hits over all cache events.
    # Same epsilon floor as above; yields 0 when there were no cache events.
    - record: nl2sql:cache_hit_ratio
      expr: |
        (
          sum(rate(cache_events_total{hit="true"}[5m]))
        )
        /
        clamp_min(sum(rate(cache_events_total[5m])), 1e-9)

    # Verifier events per minute, split by the `ok` label.
    # rate() returns a per-second value, hence the `* 60`.
    - record: nl2sql:verifier_events_per_min
      expr: |
        sum by (ok) (rate(verifier_checks_total[1m])) * 60

    # Safety blocks per minute (rate() is per-second, hence `* 60`).
    - record: nl2sql:safety_blocks_per_min
      expr: |
        sum(rate(safety_blocks_total[1m])) * 60

    # Combined safety blocks + verifier failures per minute.
    - record: nl2sql:safety_verifier_events_per_min
      expr: |
        (
          sum(rate(safety_blocks_total[1m]))
          +
          sum(rate(verifier_failures_total[1m]))
        ) * 60
# 2) Alerts.
# These read the `nl2sql:*` series produced by the recording rules. Rule groups
# are evaluated independently, so file order does not enforce evaluation order;
# an alert may see a recorded value that is up to one evaluation interval old.
- name: nl2sql_alerts
  rules:
    # Success ratio < 90% for 10 minutes.
    # The `and on()` guard requires actual pipeline traffic in the window, so
    # an idle service (where the recorded ratio is 0) does not raise the alert.
    - alert: PipelineLowSuccessRatio
      expr: |
        (nl2sql:pipeline_success_ratio < 0.9)
        and on()
        (sum(rate(pipeline_runs_total[5m])) > 0)
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Pipeline success ratio dropped"
        description: "Success ratio < 90% over the past 10 minutes"

    # Generator p95 latency > 1.5s for 5 minutes.
    # The threshold assumes the recorded series is in milliseconds (1500 ms).
    - alert: GeneratorLatencyHigh
      expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Generator p95 latency high"
        description: "Generator p95 > 1.5s for 5 minutes"

    # Safety blocks spike — per minute (not per second).
    # rate() is per-second, so `* 60` converts to per-minute. sum() aggregates
    # across all series of the counter so the threshold applies to the total
    # block rate rather than to each label combination separately.
    - alert: SafetyBlocksSpike
      expr: sum(rate(safety_blocks_total[5m])) * 60 > 0.5
      for: 5m
      labels:
        severity: info
      annotations:
        summary: "Unusual Safety block rate"
        description: "Safety blocks > 0.5 per minute (5m window)"