Spaces:
Running
Running
Melika Kheirieh
committed on
Commit
·
5e6809d
1
Parent(s):
2e3e9b8
feat(observability): add Prometheus-Grafana stack with auto-provisioning and docs
Browse files- Makefile +92 -2
- docker-compose.prom.yml +29 -1
- docs/observability.md +33 -0
- grafana/provisioning/dashboards/dashboard.yml +13 -0
- grafana/provisioning/datasources/datasource.yml +11 -0
- prometheus/grafana_dashboard.json +68 -0
Makefile
CHANGED
|
@@ -109,12 +109,14 @@ clean: ## Remove Python caches
|
|
| 109 |
clean-all: clean ## Remove build artifacts and coverage
|
| 110 |
rm -rf dist build .coverage *.egg-info
|
| 111 |
|
| 112 |
-
# ----------
|
| 113 |
-
.PHONY: prom-up prom-check smoke
|
| 114 |
|
|
|
|
| 115 |
prom-up:
|
| 116 |
docker compose -f docker-compose.prom.yml up -d
|
| 117 |
|
|
|
|
| 118 |
prom-check:
|
| 119 |
@if command -v promtool >/dev/null 2>&1; then \
|
| 120 |
echo "🔍 Running promtool locally..."; \
|
|
@@ -127,5 +129,93 @@ prom-check:
|
|
| 127 |
promtool check config /etc/prometheus/prometheus.yml; \
|
| 128 |
fi
|
| 129 |
|
|
|
|
| 130 |
smoke:
|
| 131 |
./scripts/smoke_metrics.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
clean-all: clean ## Remove build artifacts and coverage
|
| 110 |
rm -rf dist build .coverage *.egg-info
|
| 111 |
|
| 112 |
+
# ---------- Observability Stack ----------
|
| 113 |
+
.PHONY: obs-up obs-down obs-logs prom-up prom-check smoke
|
| 114 |
|
| 115 |
+
# Bring up Prometheus + Grafana via Docker Compose
|
| 116 |
prom-up:
|
| 117 |
docker compose -f docker-compose.prom.yml up -d
|
| 118 |
|
| 119 |
+
# Validate Prometheus configs (fallback to Docker if promtool is missing)
|
| 120 |
prom-check:
|
| 121 |
@if command -v promtool >/dev/null 2>&1; then \
|
| 122 |
echo "🔍 Running promtool locally..."; \
|
|
|
|
| 129 |
promtool check config /etc/prometheus/prometheus.yml; \
|
| 130 |
fi
|
| 131 |
|
| 132 |
+
# Generate sample traffic and print key metrics snapshot
|
| 133 |
smoke:
|
| 134 |
./scripts/smoke_metrics.sh
|
| 135 |
+
|
| 136 |
+
# Bring up the stack, wait until services are ready, then run smoke
|
| 137 |
+
obs-up:
|
| 138 |
+
@set -e; \
|
| 139 |
+
\
|
| 140 |
+
# 1) Up the stack
|
| 141 |
+
$(MAKE) prom-up; \
|
| 142 |
+
\
|
| 143 |
+
# 2) Wait for Prometheus readiness
|
| 144 |
+
# - Tries the /-/ready endpoint (preferred). Falls back to port check.
|
| 145 |
+
# - Times out after ~90s.
|
| 146 |
+
echo "⏳ Waiting for Prometheus (http://localhost:9090) ..."; \
|
| 147 |
+
for i in $$(seq 1 30); do \
|
| 148 |
+
# Check readiness endpoint
|
| 149 |
+
if curl -fsS http://localhost:9090/-/ready >/dev/null 2>&1; then \
|
| 150 |
+
echo "✅ Prometheus is ready"; \
|
| 151 |
+
break; \
|
| 152 |
+
fi; \
|
| 153 |
+
# Fallback: check TCP port if /-/ready is not enabled
|
| 154 |
+
if nc -z localhost 9090 >/dev/null 2>&1; then \
|
| 155 |
+
echo "✅ Prometheus port is open (assuming ready)"; \
|
| 156 |
+
break; \
|
| 157 |
+
fi; \
|
| 158 |
+
sleep 3; \
|
| 159 |
+
if [ $$i -eq 30 ]; then \
|
| 160 |
+
echo "❌ Prometheus did not become ready in time"; \
|
| 161 |
+
exit 1; \
|
| 162 |
+
fi; \
|
| 163 |
+
done; \
|
| 164 |
+
\
|
| 165 |
+
# 3) Wait for Grafana login page
|
| 166 |
+
# - Checks that /login returns HTTP 200/302.
|
| 167 |
+
echo "⏳ Waiting for Grafana (http://localhost:3000) ..."; \
|
| 168 |
+
for i in $$(seq 1 30); do \
|
| 169 |
+
code=$$(curl -s -o /dev/null -w "%{http_code}" http://localhost:3000/login || true); \
|
| 170 |
+
if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then \
|
| 171 |
+
echo "✅ Grafana is up"; \
|
| 172 |
+
break; \
|
| 173 |
+
fi; \
|
| 174 |
+
sleep 3; \
|
| 175 |
+
if [ $$i -eq 30 ]; then \
|
| 176 |
+
echo "❌ Grafana did not become ready in time"; \
|
| 177 |
+
exit 1; \
|
| 178 |
+
fi; \
|
| 179 |
+
done; \
|
| 180 |
+
\
|
| 181 |
+
# 4) Run smoke to populate metrics
|
| 182 |
+
echo "🚀 Running smoke traffic ..."; \
|
| 183 |
+
$(MAKE) smoke; \
|
| 184 |
+
echo "🎉 Observability stack is live. Open: Prometheus → http://localhost:9090 , Grafana → http://localhost:3000"
|
| 185 |
+
# 5) Auto-import Grafana dashboard
|
| 186 |
+
$(MAKE) grafana-import
|
| 187 |
+
|
| 188 |
+
# Tear down the observability stack
|
| 189 |
+
obs-down:
|
| 190 |
+
docker compose -f docker-compose.prom.yml down
|
| 191 |
+
|
| 192 |
+
# Tail logs of both services
|
| 193 |
+
obs-logs:
|
| 194 |
+
docker compose -f docker-compose.prom.yml logs -f
|
| 195 |
+
|
| 196 |
+
# ---------- Grafana Auto Import ----------
|
| 197 |
+
.PHONY: grafana-import
|
| 198 |
+
|
| 199 |
+
# Import dashboard JSON into Grafana via HTTP API
|
| 200 |
+
grafana-import:
|
| 201 |
+
@set -e; \
|
| 202 |
+
echo "⏳ Waiting for Grafana API to become ready..."; \
|
| 203 |
+
for i in $$(seq 1 30); do \
|
| 204 |
+
code=$$(curl -s -o /dev/null -w "%{http_code}" http://localhost:3000/api/health || true); \
|
| 205 |
+
if [ "$$code" = "200" ]; then \
|
| 206 |
+
echo "✅ Grafana API is ready"; \
|
| 207 |
+
break; \
|
| 208 |
+
fi; \
|
| 209 |
+
sleep 3; \
|
| 210 |
+
if [ $$i -eq 30 ]; then \
|
| 211 |
+
echo "❌ Grafana API did not become ready in time"; \
|
| 212 |
+
exit 1; \
|
| 213 |
+
fi; \
|
| 214 |
+
done; \
|
| 215 |
+
\
|
| 216 |
+
echo "📦 Importing dashboard ..."; \
|
| 217 |
+
curl -s -X POST http://admin:admin@localhost:3000/api/dashboards/db \
|
| 218 |
+
-H "Content-Type: application/json" \
|
| 219 |
+
-d "{\"dashboard\": $$(cat prometheus/grafana_dashboard.json), \"overwrite\": true, \"folderId\": 0}" \
|
| 220 |
+
| jq -r '.status' || true; \
|
| 221 |
+
echo "🎉 Dashboard imported → http://localhost:3000/dashboards"
|
docker-compose.prom.yml
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
version: "3.8"
|
|
|
|
| 2 |
services:
|
| 3 |
prometheus:
|
| 4 |
image: prom/prometheus:v2.55.0
|
|
@@ -9,5 +10,32 @@ services:
|
|
| 9 |
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 10 |
- ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
|
| 11 |
ports:
|
| 12 |
-
- "9090:9090"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
restart: unless-stopped
|
|
|
|
| 1 |
version: "3.8"
|
| 2 |
+
|
| 3 |
services:
|
| 4 |
prometheus:
|
| 5 |
image: prom/prometheus:v2.55.0
|
|
|
|
| 10 |
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 11 |
- ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
|
| 12 |
ports:
|
| 13 |
+
- "9090:9090" # Prometheus UI
|
| 14 |
+
networks:
|
| 15 |
+
- observability
|
| 16 |
+
restart: unless-stopped
|
| 17 |
+
|
| 18 |
+
grafana:
|
| 19 |
+
image: grafana/grafana:latest
|
| 20 |
+
container_name: nl2sql-grafana
|
| 21 |
+
ports:
|
| 22 |
+
- "3000:3000" # Grafana UI
|
| 23 |
+
depends_on:
|
| 24 |
+
- prometheus
|
| 25 |
+
environment:
|
| 26 |
+
- GF_SECURITY_ADMIN_USER=admin
|
| 27 |
+
- GF_SECURITY_ADMIN_PASSWORD=admin
|
| 28 |
+
- GF_USERS_ALLOW_SIGN_UP=false
|
| 29 |
+
- GF_AUTH_ANONYMOUS_ENABLED=false
|
| 30 |
+
# Optional hardening:
|
| 31 |
+
- GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/etc/grafana/provisioning/dashboards/nl2sql.json
|
| 32 |
+
volumes:
|
| 33 |
+
# Provisioning (datasource + dashboard providers)
|
| 34 |
+
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
|
| 35 |
+
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
|
| 36 |
+
|
| 37 |
+
# The actual dashboard JSON (mounted into the same dashboards dir)
|
| 38 |
+
- ./prometheus/grafana_dashboard.json:/etc/grafana/provisioning/dashboards/nl2sql.json:ro
|
| 39 |
+
networks:
|
| 40 |
+
- observability
|
| 41 |
restart: unless-stopped
|
docs/observability.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Observability and Metrics
|
| 2 |
+
|
| 3 |
+
This module adds full observability for the NL2SQL Copilot pipeline.
|
| 4 |
+
|
| 5 |
+
## 📊 Metrics exposed
|
| 6 |
+
|
| 7 |
+
| Metric | Type | Labels | Description |
|
| 8 |
+
|--------|------|---------|--------------|
|
| 9 |
+
| `stage_duration_ms` | histogram | `stage` | Duration per stage (detector, planner, generator, safety, executor, verifier) |
|
| 10 |
+
| `pipeline_runs_total` | counter | `status` | Pipeline runs by outcome (`ok`, `error`, `ambiguous`) |
|
| 11 |
+
| `safety_checks_total`, `safety_blocks_total` | counter | `reason` | Number of safety checks and blocked queries |
|
| 12 |
+
| `verifier_checks_total`, `verifier_failures_total` | counter | `reason` | Number of verification passes and failures |
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## ⚙️ Recording & Alerting Rules
|
| 17 |
+
|
| 18 |
+
Defined in `prometheus/rules.yml`:
|
| 19 |
+
|
| 20 |
+
- **`nl2sql:stage_p95_ms`** – 95th percentile latency per stage
|
| 21 |
+
- **`nl2sql:pipeline_success_ratio`** – 5-minute success ratio
|
| 22 |
+
- Alerts:
|
| 23 |
+
- `PipelineLowSuccessRatio` (<90% for 10m)
|
| 24 |
+
- `GeneratorLatencyHigh` (>1500 ms for 5m)
|
| 25 |
+
- `SafetyBlocksSpike` (>0.5/min)
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## 🧪 Local Testing
|
| 30 |
+
|
| 31 |
+
1. Start Prometheus
|
| 32 |
+
```bash
|
| 33 |
+
make prom-up
|
grafana/provisioning/dashboards/dashboard.yml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
providers:
|
| 4 |
+
- name: "NL2SQL Dashboards"
|
| 5 |
+
orgId: 1
|
| 6 |
+
folder: ""
|
| 7 |
+
type: file
|
| 8 |
+
disableDeletion: true # Prevent dashboard deletion from UI
|
| 9 |
+
editable: false
|
| 10 |
+
updateIntervalSeconds: 10 # How often Grafana scans for changes
|
| 11 |
+
options:
|
| 12 |
+
path: /etc/grafana/provisioning/dashboards
|
| 13 |
+
# Grafana will load any *.json from this directory
|
grafana/provisioning/datasources/datasource.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
datasources:
|
| 4 |
+
- name: Prometheus
|
| 5 |
+
type: prometheus
|
| 6 |
+
access: proxy # Grafana will proxy queries
|
| 7 |
+
url: http://prometheus:9090
|
| 8 |
+
isDefault: true
|
| 9 |
+
editable: false
|
| 10 |
+
jsonData:
|
| 11 |
+
httpMethod: GET # Use GET for Prometheus
|
prometheus/grafana_dashboard.json
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "NL2SQL Copilot - Observability",
|
| 3 |
+
"editable": true,
|
| 4 |
+
"panels": [
|
| 5 |
+
{
|
| 6 |
+
"type": "timeseries",
|
| 7 |
+
"title": "Stage p95 Latency (ms)",
|
| 8 |
+
"targets": [
|
| 9 |
+
{
|
| 10 |
+
"expr": "nl2sql:stage_p95_ms",
|
| 11 |
+
"legendFormat": "{{stage}}",
|
| 12 |
+
"refId": "A"
|
| 13 |
+
}
|
| 14 |
+
],
|
| 15 |
+
"fieldConfig": {
|
| 16 |
+
"defaults": {
|
| 17 |
+
"unit": "milliseconds",
|
| 18 |
+
"decimals": 0
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
"id": 1
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"type": "timeseries",
|
| 25 |
+
"title": "Pipeline Success Ratio",
|
| 26 |
+
"targets": [
|
| 27 |
+
{
|
| 28 |
+
"expr": "nl2sql:pipeline_success_ratio",
|
| 29 |
+
"legendFormat": "success ratio",
|
| 30 |
+
"refId": "B"
|
| 31 |
+
}
|
| 32 |
+
],
|
| 33 |
+
"fieldConfig": {
|
| 34 |
+
"defaults": {
|
| 35 |
+
"min": 0,
|
| 36 |
+
"max": 1,
|
| 37 |
+
"decimals": 2
|
| 38 |
+
}
|
| 39 |
+
},
|
| 40 |
+
"id": 2
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"type": "timeseries",
|
| 44 |
+
"title": "Safety & Verifier Events",
|
| 45 |
+
"targets": [
|
| 46 |
+
{
|
| 47 |
+
"expr": "rate(safety_blocks_total[5m])",
|
| 48 |
+
"legendFormat": "safety blocks/min",
|
| 49 |
+
"refId": "C"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"expr": "rate(verifier_failures_total[5m])",
|
| 53 |
+
"legendFormat": "verifier failures/min",
|
| 54 |
+
"refId": "D"
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"fieldConfig": {
|
| 58 |
+
"defaults": {
|
| 59 |
+
"min": 0
|
| 60 |
+
}
|
| 61 |
+
},
|
| 62 |
+
"id": 3
|
| 63 |
+
}
|
| 64 |
+
],
|
| 65 |
+
"schemaVersion": 38,
|
| 66 |
+
"version": 1,
|
| 67 |
+
"refresh": "30s"
|
| 68 |
+
}
|