nl2sql-copilot / nl2sql /metrics.py
Melika Kheirieh
feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage
f89e294
raw
history blame
3.73 kB
from prometheus_client import Counter, Histogram
from nl2sql.prom import REGISTRY
# -----------------------------------------------------------------------------
# Stage-level metrics
# -----------------------------------------------------------------------------
# Latency histogram for individual pipeline stages. Observations are
# partitioned by the "stage" label (e.g. detector|planner|generator|safety|
# verifier); per the metric name and help text, values are in milliseconds.
stage_duration_ms = Histogram(
    name="stage_duration_ms",
    documentation="Duration (ms) of each pipeline stage",
    labelnames=("stage",),
    # Upper bucket bounds, expressed in the same unit as the observations.
    buckets=(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Safety stage metrics
# -----------------------------------------------------------------------------
# Counter of SQL queries rejected by the safety stage, broken down by the
# "reason" label (e.g. forbidden_keyword, multiple_statements, non_readonly,
# explain_not_allowed).
safety_blocks_total = Counter(
    name="safety_blocks_total",
    documentation="Count of blocked SQL queries by safety checks",
    labelnames=("reason",),
    registry=REGISTRY,
)
# Counter of every SQL query examined by the safety stage. The "ok" label
# carries the outcome as the string "true" or "false".
safety_checks_total = Counter(
    name="safety_checks_total",
    documentation="Total SQL queries checked by safety",
    labelnames=("ok",),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Verifier stage metrics
# -----------------------------------------------------------------------------
# Counter of verifier checks, split into successes and failures through the
# "ok" label ("true" | "false").
verifier_checks_total = Counter(
    name="verifier_checks_total",
    documentation="Count of verifier checks (success/failure)",
    labelnames=("ok",),
    registry=REGISTRY,
)
# Counter of verifier failures, classified by the "reason" label
# (e.g. parse_error, semantic_check_error, adapter_failure).
verifier_failures_total = Counter(
    name="verifier_failures_total",
    documentation="Count of verifier failures by type",
    labelnames=("reason",),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Repair stage metrics
# -----------------------------------------------------------------------------
# Counter of repair-loop activity. The "outcome" label takes one of
# attempt | success | failed.
repair_attempts_total = Counter(
    name="repair_attempts_total",
    documentation="Number of repair loop attempts",
    labelnames=("outcome",),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Pipeline-level metrics
# -----------------------------------------------------------------------------
# Counter of complete pipeline runs, keyed by the final "status" label
# (ok | error | ambiguous).
pipeline_runs_total = Counter(
    name="pipeline_runs_total",
    documentation="Total number of full pipeline runs",
    labelnames=("status",),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Cache metrics (optional)
# -----------------------------------------------------------------------------
# Counter of cache lookups in the pipeline. The "hit" label is the string
# "true" for a hit and "false" for a miss.
cache_events_total = Counter(
    name="cache_events_total",
    documentation="Cache hit/miss events in the pipeline",
    labelnames=("hit",),
    registry=REGISTRY,
)
# -----------------------------------------------------------------------------
# Prime all counters with zero to ensure Grafana panels always have data
# -----------------------------------------------------------------------------
# Touch every known "reason" series on both reason-labelled counters with a
# zero increment, so each series exists before the first real event.
# NOTE(review): the same combined list is applied to both counters, so the
# safety counter also gets verifier-style reasons (and vice versa) as zero
# series -- confirm this cross-priming is intentional.
for reason in (
    "forbidden_keyword",
    "multiple_statements",
    "non_readonly",
    "explain_not_allowed",
    "parse_error",
    "semantic_check_error",
    "adapter_failure",
    "unsafe-sql",
    "malformed-sql",
    "unknown",
):
    # Safety counter first, then verifier counter, for each reason.
    for _reason_counter in (safety_blocks_total, verifier_failures_total):
        _reason_counter.labels(reason=reason).inc(0)
# Create the ok="true" and ok="false" series on both check counters by
# incrementing each by zero.
for ok in ("true", "false"):
    for _check_counter in (safety_checks_total, verifier_checks_total):
        _check_counter.labels(ok=ok).inc(0)
# Create one zero-valued repair series per known outcome.
for outcome in ("attempt", "success", "failed"):
    _repair_series = repair_attempts_total.labels(outcome=outcome)
    _repair_series.inc(0)
# Create one zero-valued pipeline-run series per known status.
for status in ("ok", "error", "ambiguous"):
    _run_series = pipeline_runs_total.labels(status=status)
    _run_series.inc(0)
# Create zero-valued cache series for both the hit and the miss case.
for hit in ("true", "false"):
    _cache_series = cache_events_total.labels(hit=hit)
    _cache_series.inc(0)