File size: 3,734 Bytes
c24bfe8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c4cb86
 
 
 
 
 
666306b
1c4cb86
 
 
c24bfe8
 
 
 
 
 
 
 
 
f89e294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from prometheus_client import Counter, Histogram
from nl2sql.prom import REGISTRY


# -----------------------------------------------------------------------------
#  Stage-level metrics
# -----------------------------------------------------------------------------
# Histogram of per-stage durations, one labelled series per pipeline stage
# (e.g. detector | planner | generator | safety | verifier).
# Bucket bounds are expressed in milliseconds, matching the metric name.
stage_duration_ms = Histogram(
    name="stage_duration_ms",
    documentation="Duration (ms) of each pipeline stage",
    labelnames=("stage",),
    buckets=(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Safety stage metrics
# -----------------------------------------------------------------------------
# Counter of SQL queries blocked by the safety checks, split by "reason"
# (e.g. forbidden_keyword, multiple_statements, non_readonly,
# explain_not_allowed).
safety_blocks_total = Counter(
    name="safety_blocks_total",
    documentation="Count of blocked SQL queries by safety checks",
    labelnames=("reason",),
    registry=REGISTRY,
)

# Counter of every SQL query seen by the safety stage; the "ok" label is the
# string "true" or "false".
safety_checks_total = Counter(
    name="safety_checks_total",
    documentation="Total SQL queries checked by safety",
    labelnames=("ok",),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Verifier stage metrics
# -----------------------------------------------------------------------------
# Counter of verifier checks; the "ok" label is the string "true" or "false".
verifier_checks_total = Counter(
    name="verifier_checks_total",
    documentation="Count of verifier checks (success/failure)",
    labelnames=("ok",),
    registry=REGISTRY,
)

# Counter of verifier failures, split by "reason"
# (e.g. parse_error, semantic_check_error, adapter_failure).
verifier_failures_total = Counter(
    name="verifier_failures_total",
    documentation="Count of verifier failures by type",
    labelnames=("reason",),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Repair stage metrics
# -----------------------------------------------------------------------------
# Counter of repair-loop attempts; the "outcome" label is one of
# attempt | success | failed.
repair_attempts_total = Counter(
    name="repair_attempts_total",
    documentation="Number of repair loop attempts",
    labelnames=("outcome",),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Pipeline-level metrics
# -----------------------------------------------------------------------------
# Counter of full pipeline runs; the "status" label is one of
# ok | error | ambiguous.
pipeline_runs_total = Counter(
    name="pipeline_runs_total",
    documentation="Total number of full pipeline runs",
    labelnames=("status",),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Cache metrics (optional)
# -----------------------------------------------------------------------------
# Counter of cache lookups in the pipeline; the "hit" label is the string
# "true" or "false".
cache_events_total = Counter(
    name="cache_events_total",
    documentation="Cache hit/miss events in the pipeline",
    labelnames=("hit",),
    registry=REGISTRY,
)

# -----------------------------------------------------------------------------
#  Prime all counters with zero to ensure Grafana panels always have data
# -----------------------------------------------------------------------------
# Each loop below requests a labelled child and increments it by 0, so the
# label combination is touched at import time without adding to any count.

# Both reason-labelled counters are primed with the same combined list, so
# each one also receives the reasons that the inline examples on the other
# counter mention.
# NOTE(review): confirm the shared list is intentional rather than one list
# per stage.
for reason in (
    # reasons given as examples on safety_blocks_total
    "forbidden_keyword",
    "multiple_statements",
    "non_readonly",
    "explain_not_allowed",
    # reasons given as examples on verifier_failures_total
    "parse_error",
    "semantic_check_error",
    "adapter_failure",
    # reasons not mentioned in either inline example
    "unsafe-sql",
    "malformed-sql",
    "unknown",
):
    for _counter in (safety_blocks_total, verifier_failures_total):
        _counter.labels(reason=reason).inc(0)

# Pass/fail counters for the safety and verifier stages.
for ok in ("true", "false"):
    for _counter in (safety_checks_total, verifier_checks_total):
        _counter.labels(ok=ok).inc(0)

# Repair-loop outcomes.
for outcome in ("attempt", "success", "failed"):
    repair_attempts_total.labels(outcome=outcome).inc(0)

# Whole-pipeline run statuses.
for status in ("ok", "error", "ambiguous"):
    pipeline_runs_total.labels(status=status).inc(0)

# Cache hit / miss.
for hit in ("true", "false"):
    cache_events_total.labels(hit=hit).inc(0)