Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

Melika Kheirieh

feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage

f89e294 about 1 month ago

3.73 kB

	from prometheus_client import Counter, Histogram
	from nl2sql.prom import REGISTRY


	# -----------------------------------------------------------------------------
	# Stage-level metrics
	# -----------------------------------------------------------------------------
	# Histogram of per-stage durations in milliseconds, labelled by "stage"
	# (e.g. detector | planner | generator | safety | verifier).
	stage_duration_ms = Histogram(
	    name="stage_duration_ms",
	    documentation="Duration (ms) of each pipeline stage",
	    labelnames=("stage",),
	    buckets=(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Safety stage metrics
	# -----------------------------------------------------------------------------
	# Counter of SQL queries blocked by the safety checks, labelled by "reason"
	# (e.g. forbidden_keyword, multiple_statements, non_readonly,
	# explain_not_allowed).
	safety_blocks_total = Counter(
	    name="safety_blocks_total",
	    documentation="Count of blocked SQL queries by safety checks",
	    labelnames=("reason",),
	    registry=REGISTRY,
	)

	# Counter of every SQL query seen by the safety stage; the "ok" label is
	# the string "true" or "false".
	safety_checks_total = Counter(
	    name="safety_checks_total",
	    documentation="Total SQL queries checked by safety",
	    labelnames=("ok",),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Verifier stage metrics
	# -----------------------------------------------------------------------------
	# Counter of verifier checks split by result; the "ok" label is the string
	# "true" or "false".
	verifier_checks_total = Counter(
	    name="verifier_checks_total",
	    documentation="Count of verifier checks (success/failure)",
	    labelnames=("ok",),
	    registry=REGISTRY,
	)

	# Counter of verifier failures broken down by "reason"
	# (e.g. parse_error, semantic_check_error, adapter_failure).
	verifier_failures_total = Counter(
	    name="verifier_failures_total",
	    documentation="Count of verifier failures by type",
	    labelnames=("reason",),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Repair stage metrics
	# -----------------------------------------------------------------------------
	# Counter of repair-loop attempts; the "outcome" label takes the values
	# attempt | success | failed.
	repair_attempts_total = Counter(
	    name="repair_attempts_total",
	    documentation="Number of repair loop attempts",
	    labelnames=("outcome",),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Pipeline-level metrics
	# -----------------------------------------------------------------------------
	# Counter of full pipeline runs; the "status" label takes the values
	# ok | error | ambiguous.
	pipeline_runs_total = Counter(
	    name="pipeline_runs_total",
	    documentation="Total number of full pipeline runs",
	    labelnames=("status",),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Cache metrics (optional)
	# -----------------------------------------------------------------------------
	# Counter of cache lookups in the pipeline; the "hit" label is the string
	# "true" or "false".
	cache_events_total = Counter(
	    name="cache_events_total",
	    documentation="Cache hit/miss events in the pipeline",
	    labelnames=("hit",),
	    registry=REGISTRY,
	)

	# -----------------------------------------------------------------------------
	# Prime all counters with zero to ensure Grafana panels always have data
	# -----------------------------------------------------------------------------
	# Touch every listed label value once with an increment of zero so the
	# corresponding series exists before any real event is recorded.
	#
	# Both reason-labelled counters are primed with the same combined list.
	for reason in (
	    "forbidden_keyword",
	    "multiple_statements",
	    "non_readonly",
	    "explain_not_allowed",
	    "parse_error",
	    "semantic_check_error",
	    "adapter_failure",
	    "unsafe-sql",
	    "malformed-sql",
	    "unknown",
	):
	    for reason_counter in (safety_blocks_total, verifier_failures_total):
	        reason_counter.labels(reason=reason).inc(0)

	# The safety and verifier check counters share the "ok" label.
	for ok in ("true", "false"):
	    for check_counter in (safety_checks_total, verifier_checks_total):
	        check_counter.labels(ok=ok).inc(0)

	# Single-label counters: one zero increment per known label value.
	for outcome in ("attempt", "success", "failed"):
	    repair_attempts_total.labels(outcome).inc(0)

	for status in ("ok", "error", "ambiguous"):
	    pipeline_runs_total.labels(status).inc(0)

	for hit in ("true", "false"):
	    cache_events_total.labels(hit).inc(0)