Melika Kheirieh committed on
Commit
c4b6bf9
·
1 Parent(s): 4c2cf14

feat(observability): add refined Prometheus rules and Grafana dashboard polish

Browse files
grafana/provisioning/dashboards/nl2sql.json CHANGED
@@ -1,68 +1,143 @@
1
  {
2
- "title": "NL2SQL Copilot - Observability",
 
 
3
  "editable": true,
 
 
 
4
  "panels": [
5
  {
 
6
  "type": "timeseries",
7
  "title": "Stage p95 Latency (ms)",
 
8
  "targets": [
9
- {
10
- "expr": "nl2sql:stage_p95_ms",
11
- "legendFormat": "{{stage}}",
12
- "refId": "A"
13
- }
14
  ],
15
  "fieldConfig": {
16
  "defaults": {
17
- "unit": "milliseconds",
18
- "decimals": 0
19
- }
 
 
 
 
 
 
 
 
 
20
  },
21
- "id": 1
22
  },
23
  {
24
- "type": "timeseries",
25
- "title": "Pipeline Success Ratio",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  "targets": [
27
- {
28
- "expr": "nl2sql:pipeline_success_ratio",
29
- "legendFormat": "success ratio",
30
- "refId": "B"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
 
 
 
 
 
 
 
 
 
 
32
  ],
33
  "fieldConfig": {
34
  "defaults": {
 
35
  "min": 0,
36
- "max": 1,
37
- "decimals": 2
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  },
40
- "id": 2
41
  },
42
  {
 
43
  "type": "timeseries",
44
- "title": "Safety & Verifier Events",
 
45
  "targets": [
46
- {
47
- "expr": "rate(safety_blocks_total[5m])",
48
- "legendFormat": "safety blocks/min",
49
- "refId": "C"
50
- },
51
- {
52
- "expr": "rate(verifier_failures_total[5m])",
53
- "legendFormat": "verifier failures/min",
54
- "refId": "D"
55
- }
56
  ],
57
  "fieldConfig": {
58
- "defaults": {
59
- "min": 0
60
- }
61
  },
62
- "id": 3
63
  }
64
- ],
65
- "schemaVersion": 38,
66
- "version": 1,
67
- "refresh": "30s"
68
  }
 
1
  {
2
+ "title": "NL2SQL Copilot Unified Observability",
3
+ "uid": "nl2sql-unified",
4
+ "timezone": "browser",
5
  "editable": true,
6
+ "schemaVersion": 38,
7
+ "version": 1,
8
+ "refresh": "30s",
9
  "panels": [
10
  {
11
+ "id": 1,
12
  "type": "timeseries",
13
  "title": "Stage p95 Latency (ms)",
14
+ "datasource": "Prometheus",
15
  "targets": [
16
+ { "expr": "nl2sql:stage_p95_ms", "legendFormat": "{{stage}}", "refId": "A" }
 
 
 
 
17
  ],
18
  "fieldConfig": {
19
  "defaults": {
20
+ "unit": "ms",
21
+ "decimals": 0,
22
+ "thresholds": {
23
+ "mode": "absolute",
24
+ "steps": [
25
+ { "color": "green" },
26
+ { "value": 1000, "color": "orange" },
27
+ { "value": 2000, "color": "red" }
28
+ ]
29
+ }
30
+ },
31
+ "overrides": []
32
  },
33
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 6 }
34
  },
35
  {
36
+ "id": 2,
37
+ "type": "stat",
38
+ "title": "Pipeline OK Ratio (%)",
39
+ "datasource": "Prometheus",
40
+ "targets": [
41
+ { "expr": "nl2sql:pipeline_success_ratio * 100", "refId": "A" }
42
+ ],
43
+ "options": {
44
+ "reduceOptions": { "calcs": ["lastNotNull"] },
45
+ "colorMode": "value",
46
+ "graphMode": "none",
47
+ "justifyMode": "center"
48
+ },
49
+ "fieldConfig": {
50
+ "defaults": {
51
+ "unit": "percent",
52
+ "decimals": 1,
53
+ "thresholds": {
54
+ "mode": "absolute",
55
+ "steps": [
56
+ { "color": "red" },
57
+ { "value": 90, "color": "orange" },
58
+ { "value": 97, "color": "green" }
59
+ ]
60
+ }
61
+ },
62
+ "overrides": []
63
+ },
64
+ "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 }
65
+ },
66
+ {
67
+ "id": 3,
68
+ "type": "stat",
69
+ "title": "Repair Success Rate (%)",
70
+ "datasource": "Prometheus",
71
  "targets": [
72
+ { "expr": "nl2sql:repair_success_rate * 100", "refId": "A" }
73
+ ],
74
+ "options": {
75
+ "reduceOptions": { "calcs": ["lastNotNull"] },
76
+ "colorMode": "value",
77
+ "graphMode": "none",
78
+ "justifyMode": "center"
79
+ },
80
+ "fieldConfig": {
81
+ "defaults": {
82
+ "unit": "percent",
83
+ "decimals": 1,
84
+ "thresholds": {
85
+ "mode": "absolute",
86
+ "steps": [
87
+ { "color": "red" },
88
+ { "value": 80, "color": "orange" },
89
+ { "value": 95, "color": "green" }
90
+ ]
91
+ }
92
  }
93
+ },
94
+ "gridPos": { "x": 12, "y": 4, "w": 6, "h": 4 }
95
+ },
96
+ {
97
+ "id": 4,
98
+ "type": "gauge",
99
+ "title": "Cache Hit Ratio",
100
+ "datasource": "Prometheus",
101
+ "targets": [
102
+ { "expr": "nl2sql:cache_hit_ratio * 100", "refId": "A" }
103
  ],
104
  "fieldConfig": {
105
  "defaults": {
106
+ "unit": "percent",
107
  "min": 0,
108
+ "max": 100,
109
+ "thresholds": {
110
+ "mode": "absolute",
111
+ "steps": [
112
+ { "color": "red" },
113
+ { "value": 70, "color": "orange" },
114
+ { "value": 90, "color": "green" }
115
+ ]
116
+ }
117
+ },
118
+ "overrides": []
119
+ },
120
+ "options": {
121
+ "orientation": "auto",
122
+ "reduceOptions": { "calcs": ["lastNotNull"] },
123
+ "showThresholdLabels": false,
124
+ "showThresholdMarkers": true
125
  },
126
+ "gridPos": { "x": 0, "y": 6, "w": 6, "h": 4 }
127
  },
128
  {
129
+ "id": 5,
130
  "type": "timeseries",
131
+ "title": "Safety & Verifier Events (per min)",
132
+ "datasource": "Prometheus",
133
  "targets": [
134
+ { "expr": "rate(safety_blocks_total[5m]) * 60", "legendFormat": "safety blocks", "refId": "A" },
135
+ { "expr": "rate(verifier_failures_total[5m]) * 60", "legendFormat": "verifier failures", "refId": "B" }
 
 
 
 
 
 
 
 
136
  ],
137
  "fieldConfig": {
138
+ "defaults": { "min": 0, "decimals": 0 }
 
 
139
  },
140
+ "gridPos": { "x": 6, "y": 6, "w": 12, "h": 6 }
141
  }
142
+ ]
 
 
 
143
  }
prometheus/grafana_dashboard.json CHANGED
@@ -1,10 +1,15 @@
1
  {
2
  "title": "NL2SQL Copilot - Observability",
3
  "editable": true,
 
 
 
4
  "panels": [
5
  {
 
6
  "type": "timeseries",
7
  "title": "Stage p95 Latency (ms)",
 
8
  "targets": [
9
  {
10
  "expr": "nl2sql:stage_p95_ms",
@@ -14,55 +19,148 @@
14
  ],
15
  "fieldConfig": {
16
  "defaults": {
17
- "unit": "milliseconds",
18
- "decimals": 0
 
 
 
 
 
 
 
 
19
  }
20
  },
21
- "id": 1
22
  },
23
  {
24
- "type": "timeseries",
25
- "title": "Pipeline Success Ratio",
 
 
26
  "targets": [
27
  {
28
- "expr": "nl2sql:pipeline_success_ratio",
29
- "legendFormat": "success ratio",
30
  "refId": "B"
31
  }
32
  ],
 
 
 
 
 
 
33
  "fieldConfig": {
34
  "defaults": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "min": 0,
36
- "max": 1,
37
- "decimals": 2
 
 
 
 
 
 
 
38
  }
39
  },
40
- "id": 2
 
 
 
 
 
 
41
  },
42
  {
 
43
  "type": "timeseries",
44
  "title": "Safety & Verifier Events",
 
45
  "targets": [
46
  {
47
- "expr": "rate(safety_blocks_total[5m])",
48
  "legendFormat": "safety blocks/min",
49
- "refId": "C"
50
  },
51
  {
52
- "expr": "rate(verifier_failures_total[5m])",
53
  "legendFormat": "verifier failures/min",
54
- "refId": "D"
55
  }
56
  ],
57
  "fieldConfig": {
58
  "defaults": {
59
- "min": 0
 
 
60
  }
61
  },
62
- "id": 3
63
  }
64
- ],
65
- "schemaVersion": 38,
66
- "version": 1,
67
- "refresh": "30s"
68
  }
 
1
  {
2
  "title": "NL2SQL Copilot - Observability",
3
  "editable": true,
4
+ "schemaVersion": 38,
5
+ "version": 2,
6
+ "refresh": "30s",
7
  "panels": [
8
  {
9
+ "id": 1,
10
  "type": "timeseries",
11
  "title": "Stage p95 Latency (ms)",
12
+ "datasource": "Prometheus",
13
  "targets": [
14
  {
15
  "expr": "nl2sql:stage_p95_ms",
 
19
  ],
20
  "fieldConfig": {
21
  "defaults": {
22
+ "unit": "ms",
23
+ "decimals": 0,
24
+ "thresholds": {
25
+ "mode": "absolute",
26
+ "steps": [
27
+ { "color": "green" },
28
+ { "value": 1000, "color": "orange" },
29
+ { "value": 2000, "color": "red" }
30
+ ]
31
+ }
32
  }
33
  },
34
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 6 }
35
  },
36
  {
37
+ "id": 2,
38
+ "type": "stat",
39
+ "title": "Pipeline OK Ratio (%)",
40
+ "datasource": "Prometheus",
41
  "targets": [
42
  {
43
+ "expr": "nl2sql:pipeline_success_ratio * 100",
44
+ "legendFormat": "OK ratio",
45
  "refId": "B"
46
  }
47
  ],
48
+ "options": {
49
+ "reduceOptions": { "calcs": ["lastNotNull"] },
50
+ "colorMode": "value",
51
+ "graphMode": "none",
52
+ "justifyMode": "center"
53
+ },
54
  "fieldConfig": {
55
  "defaults": {
56
+ "unit": "percent",
57
+ "decimals": 1,
58
+ "thresholds": {
59
+ "mode": "absolute",
60
+ "steps": [
61
+ { "color": "red" },
62
+ { "value": 90, "color": "orange" },
63
+ { "value": 97, "color": "green" }
64
+ ]
65
+ }
66
+ }
67
+ },
68
+ "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 }
69
+ },
70
+ {
71
+ "id": 3,
72
+ "type": "stat",
73
+ "title": "Repair Success Rate (%)",
74
+ "datasource": "Prometheus",
75
+ "targets": [
76
+ {
77
+ "expr": "nl2sql:repair_success_rate * 100",
78
+ "legendFormat": "repair success",
79
+ "refId": "C"
80
+ }
81
+ ],
82
+ "options": {
83
+ "reduceOptions": { "calcs": ["lastNotNull"] },
84
+ "colorMode": "value",
85
+ "graphMode": "none",
86
+ "justifyMode": "center"
87
+ },
88
+ "fieldConfig": {
89
+ "defaults": {
90
+ "unit": "percent",
91
+ "decimals": 1,
92
+ "thresholds": {
93
+ "mode": "absolute",
94
+ "steps": [
95
+ { "color": "red" },
96
+ { "value": 80, "color": "orange" },
97
+ { "value": 95, "color": "green" }
98
+ ]
99
+ }
100
+ }
101
+ },
102
+ "gridPos": { "x": 12, "y": 4, "w": 6, "h": 4 }
103
+ },
104
+ {
105
+ "id": 4,
106
+ "type": "gauge",
107
+ "title": "Cache Hit Ratio",
108
+ "datasource": "Prometheus",
109
+ "targets": [
110
+ {
111
+ "expr": "nl2sql:cache_hit_ratio * 100",
112
+ "legendFormat": "cache hit",
113
+ "refId": "D"
114
+ }
115
+ ],
116
+ "fieldConfig": {
117
+ "defaults": {
118
+ "unit": "percent",
119
  "min": 0,
120
+ "max": 100,
121
+ "thresholds": {
122
+ "mode": "absolute",
123
+ "steps": [
124
+ { "color": "red" },
125
+ { "value": 70, "color": "orange" },
126
+ { "value": 90, "color": "green" }
127
+ ]
128
+ }
129
  }
130
  },
131
+ "options": {
132
+ "orientation": "auto",
133
+ "reduceOptions": { "calcs": ["lastNotNull"] },
134
+ "showThresholdLabels": false,
135
+ "showThresholdMarkers": true
136
+ },
137
+ "gridPos": { "x": 0, "y": 6, "w": 6, "h": 4 }
138
  },
139
  {
140
+ "id": 5,
141
  "type": "timeseries",
142
  "title": "Safety & Verifier Events",
143
+ "datasource": "Prometheus",
144
  "targets": [
145
  {
146
+ "expr": "rate(safety_blocks_total[5m]) * 60",
147
  "legendFormat": "safety blocks/min",
148
+ "refId": "E"
149
  },
150
  {
151
+ "expr": "rate(verifier_failures_total[5m]) * 60",
152
  "legendFormat": "verifier failures/min",
153
+ "refId": "F"
154
  }
155
  ],
156
  "fieldConfig": {
157
  "defaults": {
158
+ "unit": "none",
159
+ "min": 0,
160
+ "decimals": 0
161
  }
162
  },
163
+ "gridPos": { "x": 6, "y": 6, "w": 12, "h": 6 }
164
  }
165
+ ]
 
 
 
166
  }
prometheus/rules.yml CHANGED
@@ -1,49 +1,75 @@
1
  groups:
2
- - name: nl2sql_latency
 
 
3
  rules:
4
- # p95 latency per stage (5-minute window)
5
- - record: nl2sql:stage_p95_ms
6
- expr: |
7
- histogram_quantile(
8
- 0.95,
9
- sum(rate(stage_duration_ms_bucket[5m])) by (le, stage)
10
- )
11
 
12
- # pipeline success ratio (5-minute rolling window)
13
- - record: nl2sql:pipeline_success_ratio
14
- expr: |
15
- sum(rate(pipeline_runs_total{status="ok"}[5m]))
16
- /
17
- sum(rate(pipeline_runs_total[5m]))
 
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  - name: nl2sql_alerts
20
  rules:
21
- # Alert: success ratio below 90% for 10 minutes
22
- - alert: PipelineLowSuccessRatio
23
- expr: nl2sql:pipeline_success_ratio < 0.9
24
- for: 10m
25
- labels:
26
- severity: warning
27
- annotations:
28
- summary: "Pipeline success ratio dropped"
29
- description: "Success ratio < 90% over the past 10 minutes"
30
 
31
- # Alert: high generator p95 latency (>1.5s for 5 minutes)
32
- - alert: GeneratorLatencyHigh
33
- expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
34
- for: 5m
35
- labels:
36
- severity: warning
37
- annotations:
38
- summary: "Generator p95 latency high"
39
- description: "Generator p95 > 1.5s for 5 minutes"
40
 
41
- # Alert: unusual spike in Safety blocks
42
- - alert: SafetyBlocksSpike
43
- expr: rate(safety_blocks_total[5m]) > 0.5
44
- for: 5m
45
- labels:
46
- severity: info
47
- annotations:
48
- summary: "Unusual Safety block rate"
49
- description: "Safety blocks rate > 0.5 per minute (check inputs or safety rules)"
 
1
  groups:
2
+ # 1) Recording rules (all derived metric calculations)
3
+ - name: nl2sql_derived
4
+ interval: 15s
5
  rules:
6
+ # p95 latency per stage (ms) — remove *1000 if histogram buckets are already in milliseconds
7
+ - record: nl2sql:stage_p95_ms
8
+ expr: |
9
+ histogram_quantile(
10
+ 0.95,
11
+ sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
12
+ ) * 1000
13
 
14
+ # pipeline success ratio (0..1) safe division to avoid divide-by-zero
15
+ - record: nl2sql:pipeline_success_ratio
16
+ expr: |
17
+ (
18
+ sum(rate(pipeline_runs_total{status="ok"}[5m]))
19
+ )
20
+ /
21
+ clamp_min(sum(rate(pipeline_runs_total[5m])), 1)
22
 
23
+ # repair success rate (0..1)
24
+ - record: nl2sql:repair_success_rate
25
+ expr: |
26
+ (
27
+ sum(rate(repair_attempts_total{outcome="success"}[5m]))
28
+ )
29
+ /
30
+ clamp_min(sum(rate(repair_attempts_total[5m])), 1)
31
+
32
+ # cache hit ratio (0..1)
33
+ - record: nl2sql:cache_hit_ratio
34
+ expr: |
35
+ (
36
+ sum(rate(cache_hits_total[5m]))
37
+ )
38
+ /
39
+ clamp_min(
40
+ sum(rate(cache_hits_total[5m])) + sum(rate(cache_misses_total[5m])),
41
+ 1
42
+ )
43
+
44
+ # 2) Alerts (must come after recording rules)
45
  - name: nl2sql_alerts
46
  rules:
47
+ # Success ratio < 90% for 10 minutes
48
+ - alert: PipelineLowSuccessRatio
49
+ expr: nl2sql:pipeline_success_ratio < 0.9
50
+ for: 10m
51
+ labels:
52
+ severity: warning
53
+ annotations:
54
+ summary: "Pipeline success ratio dropped"
55
+ description: "Success ratio < 90% over the past 10 minutes"
56
 
57
+ # Generator p95 latency > 1.5s for 5 minutes
58
+ - alert: GeneratorLatencyHigh
59
+ expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
60
+ for: 5m
61
+ labels:
62
+ severity: warning
63
+ annotations:
64
+ summary: "Generator p95 latency high"
65
+ description: "Generator p95 > 1.5s for 5 minutes"
66
 
67
+ # Safety blocks spike per minute (not per second)
68
+ - alert: SafetyBlocksSpike
69
+ expr: rate(safety_blocks_total[5m]) * 60 > 0.5
70
+ for: 5m
71
+ labels:
72
+ severity: info
73
+ annotations:
74
+ summary: "Unusual Safety block rate"
75
+ description: "Safety blocks > 0.5 per minute (5m window)"
scripts/smoke_metrics.sh CHANGED
@@ -12,16 +12,19 @@ for q in \
12
  do
13
  curl -s -X POST "$API/nl2sql" \
14
  -H 'Content-Type: application/json' \
 
15
  -d "{\"query\":\"$q\"}" >/dev/null || true
16
  done
17
 
18
  # Send queries that trigger safety and verifier checks
19
  curl -s -X POST "$API/nl2sql" \
20
  -H 'Content-Type: application/json' \
 
21
  -d '{"query":"DELETE FROM users;"}' >/dev/null || true
22
 
23
  curl -s -X POST "$API/nl2sql" \
24
  -H 'Content-Type: application/json' \
 
25
  -d '{"query":"SELECT COUNT(*), country FROM customers;"}' >/dev/null || true
26
 
27
  # Print a snapshot of key Prometheus metrics
 
12
  do
13
  curl -s -X POST "$API/nl2sql" \
14
  -H 'Content-Type: application/json' \
15
+ -H 'X-API-Key: dev-key' \
16
  -d "{\"query\":\"$q\"}" >/dev/null || true
17
  done
18
 
19
  # Send queries that trigger safety and verifier checks
20
  curl -s -X POST "$API/nl2sql" \
21
  -H 'Content-Type: application/json' \
22
+ -H 'X-API-Key: dev-key' \
23
  -d '{"query":"DELETE FROM users;"}' >/dev/null || true
24
 
25
  curl -s -X POST "$API/nl2sql" \
26
  -H 'Content-Type: application/json' \
27
+ -H 'X-API-Key: dev-key' \
28
  -d '{"query":"SELECT COUNT(*), country FROM customers;"}' >/dev/null || true
29
 
30
  # Print a snapshot of key Prometheus metrics