Melika Kheirieh committed on
Commit
5e6809d
·
1 Parent(s): 2e3e9b8

feat(observability): add Prometheus-Grafana stack with auto-provisioning and docs

Browse files
Makefile CHANGED
@@ -109,12 +109,14 @@ clean: ## Remove Python caches
109
  clean-all: clean ## Remove build artifacts and coverage
110
  rm -rf dist build .coverage *.egg-info
111
 
112
- # ---------- Metrics ----------
113
- .PHONY: prom-up prom-check smoke
114
 
 
115
  prom-up:
116
  docker compose -f docker-compose.prom.yml up -d
117
 
 
118
  prom-check:
119
  @if command -v promtool >/dev/null 2>&1; then \
120
  echo "🔍 Running promtool locally..."; \
@@ -127,5 +129,93 @@ prom-check:
127
  promtool check config /etc/prometheus/prometheus.yml; \
128
  fi
129
 
 
130
  smoke:
131
  ./scripts/smoke_metrics.sh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  clean-all: clean ## Remove build artifacts and coverage
110
  rm -rf dist build .coverage *.egg-info
111
 
112
+ # ---------- Observability Stack ----------
113
+ .PHONY: obs-up obs-down obs-logs prom-up prom-check smoke
114
 
115
+ # Bring up Prometheus + Grafana via Docker Compose
116
  prom-up:
117
  docker compose -f docker-compose.prom.yml up -d
118
 
119
+ # Validate Prometheus configs (fallback to Docker if promtool is missing)
120
  prom-check:
121
  @if command -v promtool >/dev/null 2>&1; then \
122
  echo "🔍 Running promtool locally..."; \
 
129
  promtool check config /etc/prometheus/prometheus.yml; \
130
  fi
131
 
132
+ # Generate sample traffic and print key metrics snapshot
133
  smoke:
134
  ./scripts/smoke_metrics.sh
135
+
# Bring up the stack, wait until services are ready, run smoke, then import
# the Grafana dashboard.
#
#   1) Start the stack (prom-up).
#   2) Wait for Prometheus: try the /-/ready endpoint (preferred), fall back
#      to a TCP port check; give up after 30 attempts spaced 3 s apart.
#   3) Wait for Grafana: /login must answer HTTP 200 or 302 (same budget).
#   4) Run smoke to populate metrics.
#   5) Import the Grafana dashboard (grafana-import).
#
# Comments are kept up here on purpose: inside a backslash-continued recipe, a
# comment line without its own trailing backslash ends the shell command early
# and leaves the surrounding `for ...; do` unterminated (shell syntax error).
# Each step below is its own recipe line, so make stops at the first failure
# without relying on `set -e` surviving across lines.
obs-up:
	@$(MAKE) prom-up
	@echo "⏳ Waiting for Prometheus (http://localhost:9090) ..."; \
	for i in $$(seq 1 30); do \
		if curl -fsS http://localhost:9090/-/ready >/dev/null 2>&1; then \
			echo "✅ Prometheus is ready"; \
			break; \
		fi; \
		if nc -z localhost 9090 >/dev/null 2>&1; then \
			echo "✅ Prometheus port is open (assuming ready)"; \
			break; \
		fi; \
		if [ $$i -eq 30 ]; then \
			echo "❌ Prometheus did not become ready in time"; \
			exit 1; \
		fi; \
		sleep 3; \
	done
	@echo "⏳ Waiting for Grafana (http://localhost:3000) ..."; \
	for i in $$(seq 1 30); do \
		code=$$(curl -s -o /dev/null -w "%{http_code}" http://localhost:3000/login || true); \
		if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then \
			echo "✅ Grafana is up"; \
			break; \
		fi; \
		if [ $$i -eq 30 ]; then \
			echo "❌ Grafana did not become ready in time"; \
			exit 1; \
		fi; \
		sleep 3; \
	done
	@echo "🚀 Running smoke traffic ..."
	@$(MAKE) smoke
	@echo "🎉 Observability stack is live. Open: Prometheus → http://localhost:9090 , Grafana → http://localhost:3000"
	@$(MAKE) grafana-import
187
+
188
+ # Tear down the observability stack
189
+ obs-down: ## Tear down the observability stack
190
+ docker compose -f docker-compose.prom.yml down
191
+
192
+ # Tail logs of both services
193
+ obs-logs: ## Follow logs of the services in docker-compose.prom.yml
194
+ docker compose -f docker-compose.prom.yml logs -f
195
+
196
+ # ---------- Grafana Auto Import ----------
197
+ .PHONY: grafana-import
198
+
# Import dashboard JSON into Grafana via HTTP API.
#
# Endpoint, credentials and dashboard file default to the local dev stack and
# can be overridden on the command line, e.g.
#   make grafana-import GRAFANA_PASSWORD=s3cret
GRAFANA_URL ?= http://localhost:3000
GRAFANA_USER ?= admin
GRAFANA_PASSWORD ?= admin
GRAFANA_DASHBOARD ?= prometheus/grafana_dashboard.json

# Step 1 waits for /api/health to answer HTTP 200 (30 attempts, 3 s apart) and
# fails the target if Grafana never comes up.
# Step 2 wraps the dashboard file with jq (which also validates the JSON) and
# POSTs it. The import itself stays best-effort: a rejected import prints a
# warning with Grafana's response instead of a success message, but does not
# fail the target.
grafana-import:
	@echo "⏳ Waiting for Grafana API to become ready..."; \
	for i in $$(seq 1 30); do \
		code=$$(curl -s -o /dev/null -w "%{http_code}" "$(GRAFANA_URL)/api/health" || true); \
		if [ "$$code" = "200" ]; then \
			echo "✅ Grafana API is ready"; \
			break; \
		fi; \
		if [ $$i -eq 30 ]; then \
			echo "❌ Grafana API did not become ready in time"; \
			exit 1; \
		fi; \
		sleep 3; \
	done
	@echo "📦 Importing dashboard ..."; \
	resp=$$(jq '{dashboard: ., overwrite: true, folderId: 0}' "$(GRAFANA_DASHBOARD)" \
		| curl -s -u "$(GRAFANA_USER):$(GRAFANA_PASSWORD)" \
			-X POST "$(GRAFANA_URL)/api/dashboards/db" \
			-H "Content-Type: application/json" \
			--data-binary @- || true); \
	status=$$(printf '%s' "$$resp" | jq -r '.status // empty' 2>/dev/null || true); \
	if [ "$$status" = "success" ]; then \
		echo "🎉 Dashboard imported → $(GRAFANA_URL)/dashboards"; \
	else \
		echo "⚠️  Dashboard import did not succeed. Grafana said: $${resp:-<no response>}" >&2; \
	fi
docker-compose.prom.yml CHANGED
@@ -1,4 +1,5 @@
1
  version: "3.8"
 
2
  services:
3
  prometheus:
4
  image: prom/prometheus:v2.55.0
@@ -9,5 +10,32 @@ services:
9
  - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
10
  - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
11
  ports:
12
- - "9090:9090"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  restart: unless-stopped
 
1
  version: "3.8"
2
+
3
  services:
4
  prometheus:
5
  image: prom/prometheus:v2.55.0
 
10
  - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
11
  - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
12
  ports:
13
+ - "9090:9090" # Prometheus UI
14
+ networks:
15
+ - observability
16
+ restart: unless-stopped
17
+
18
+ grafana:
19
+ image: grafana/grafana:latest
20
+ container_name: nl2sql-grafana
21
+ ports:
22
+ - "3000:3000" # Grafana UI
23
+ depends_on:
24
+ - prometheus
25
+ environment:
26
+ - GF_SECURITY_ADMIN_USER=admin
27
+ - GF_SECURITY_ADMIN_PASSWORD=admin
28
+ - GF_USERS_ALLOW_SIGN_UP=false
29
+ - GF_AUTH_ANONYMOUS_ENABLED=false
30
+ # Optional hardening:
31
+ - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/provisioning/dashboards/nl2sql.json
32
+ volumes:
33
+ # Provisioning (datasource + dashboard providers)
34
+ - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
35
+ - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
36
+
37
+ # The actual dashboard JSON (mounted into the same dashboards dir)
38
+ - ./prometheus/grafana_dashboard.json:/etc/grafana/provisioning/dashboards/nl2sql.json:ro
39
+ networks:
40
+ - observability
41
  restart: unless-stopped
docs/observability.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Observability and Metrics
2
+
3
+ This module adds full observability for the NL2SQL Copilot pipeline.
4
+
5
+ ## 📊 Metrics exposed
6
+
7
+ | Metric | Type | Labels | Description |
8
+ |--------|------|---------|--------------|
9
+ | `stage_duration_ms` | histogram | `stage` | Duration per stage (detector, planner, generator, safety, executor, verifier) |
10
+ | `pipeline_runs_total` | counter | `status` | Pipeline runs by outcome (`ok`, `error`, `ambiguous`) |
11
+ | `safety_checks_total`, `safety_blocks_total` | counter | `reason` | Number of safety checks and blocked queries |
12
+ | `verifier_checks_total`, `verifier_failures_total` | counter | `reason` | Number of verification passes and failures |
13
+
14
+ ---
15
+
16
+ ## ⚙️ Recording & Alerting Rules
17
+
18
+ Defined in `prometheus/rules.yml`:
19
+
20
+ - **`nl2sql:stage_p95_ms`** – 95th percentile latency per stage
21
+ - **`nl2sql:pipeline_success_ratio`** – 5-minute success ratio
22
+ - Alerts:
23
+ - `PipelineLowSuccessRatio` (<90% for 10m)
24
+ - `GeneratorLatencyHigh` (>1500 ms for 5m)
25
+ - `SafetyBlocksSpike` (>0.5/min)
26
+
27
+ ---
28
+
29
+ ## 🧪 Local Testing
30
+
31
+ 1. Start Prometheus
32
+ ```bash
33
+ make prom-up
grafana/provisioning/dashboards/dashboard.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: "NL2SQL Dashboards"
5
+ orgId: 1
6
+ folder: ""
7
+ type: file
8
+ disableDeletion: true # Prevent dashboard deletion from UI
9
+ editable: false
10
+ updateIntervalSeconds: 10 # How often Grafana scans for changes
11
+ options:
12
+ path: /etc/grafana/provisioning/dashboards
13
+ # Grafana will load any *.json from this directory
grafana/provisioning/datasources/datasource.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ datasources:
4
+ - name: Prometheus
5
+ type: prometheus
6
+ access: proxy # Grafana will proxy queries
7
+ url: http://prometheus:9090
8
+ isDefault: true
9
+ editable: false
10
+ jsonData:
11
+ httpMethod: GET # Use GET for Prometheus
prometheus/grafana_dashboard.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{
  "title": "NL2SQL Copilot - Observability",
  "editable": true,
  "panels": [
    {
      "type": "timeseries",
      "title": "Stage p95 Latency (ms)",
      "targets": [
        {
          "expr": "nl2sql:stage_p95_ms",
          "legendFormat": "{{stage}}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "ms",
          "decimals": 0
        }
      },
      "id": 1
    },
    {
      "type": "timeseries",
      "title": "Pipeline Success Ratio",
      "targets": [
        {
          "expr": "nl2sql:pipeline_success_ratio",
          "legendFormat": "success ratio",
          "refId": "B"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "min": 0,
          "max": 1,
          "decimals": 2
        }
      },
      "id": 2
    },
    {
      "type": "timeseries",
      "title": "Safety & Verifier Events",
      "targets": [
        {
          "expr": "60 * rate(safety_blocks_total[5m])",
          "legendFormat": "safety blocks/min",
          "refId": "C"
        },
        {
          "expr": "60 * rate(verifier_failures_total[5m])",
          "legendFormat": "verifier failures/min",
          "refId": "D"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "min": 0
        }
      },
      "id": 3
    }
  ],
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s"
}