Melika Kheirieh committed on
Commit
c24bfe8
·
1 Parent(s): 4fe45ad

feat(metrics): integrate Prometheus; expose /metrics; instrument safety & verifier

Browse files
Files changed (6) hide show
  1. app/main.py +58 -34
  2. app/routers/dev.py +65 -0
  3. nl2sql/metrics.py +60 -0
  4. nl2sql/prom.py +3 -0
  5. nl2sql/safety.py +22 -0
  6. nl2sql/verifier.py +24 -0
app/main.py CHANGED
@@ -1,14 +1,10 @@
1
  import os
2
  import time
3
  from fastapi import FastAPI, Request, Response, HTTPException
4
- from fastapi.responses import PlainTextResponse
5
- from prometheus_client import (
6
- Counter,
7
- Histogram,
8
- CollectorRegistry,
9
- generate_latest,
10
- CONTENT_TYPE_LATEST,
11
- )
12
 
13
  try:
14
  from dotenv import load_dotenv
@@ -19,7 +15,7 @@ except Exception:
19
 
20
  from app.routers import nl2sql
21
 
22
- # ---- Optionally restore uploaded DB map ----
23
  try:
24
  from app.routers.nl2sql import _load_db_map
25
 
@@ -27,17 +23,24 @@ try:
27
  except Exception as e:
28
  print(f"⚠️ DB map not restored: {e}")
29
 
30
- application: FastAPI = FastAPI(
 
 
 
31
  title="NL2SQL Copilot Prototype",
32
  version=os.getenv("APP_VERSION", "0.1.0"),
33
  description="Convert natural language to safe & verified SQL",
34
  )
35
 
36
- application.include_router(nl2sql.router, prefix="/api/v1") # e.g. /api/v1/nl2sql
37
- application.include_router(nl2sql.router) # e.g. /nl2sql
38
 
39
- # ---- Prometheus metrics ----
40
- REGISTRY = CollectorRegistry()
 
 
 
 
41
  REQUEST_COUNT = Counter(
42
  "http_requests_total",
43
  "Total HTTP requests",
@@ -46,7 +49,7 @@ REQUEST_COUNT = Counter(
46
  )
47
  REQUEST_LATENCY = Histogram(
48
  "http_request_latency_seconds",
49
- "Request latency",
50
  ["path", "method"],
51
  registry=REGISTRY,
52
  )
@@ -58,23 +61,26 @@ async def metrics_middleware(request: Request, call_next):
58
  response: Response = await call_next(request)
59
  elapsed = time.perf_counter() - start
60
  route = request.scope.get("route")
61
- path = route.path if route else request.url.path
 
 
62
  REQUEST_COUNT.labels(
63
- path=path, method=request.method, status_code=str(response.status_code)
 
 
64
  ).inc()
65
- REQUEST_LATENCY.labels(path=path, method=request.method).observe(elapsed)
66
  return response
67
 
68
 
69
- # --- Liveness ---
 
 
70
  @application.get("/healthz", response_class=PlainTextResponse, tags=["system"])
71
  def healthz() -> str:
72
  return "ok"
73
 
74
 
75
- # --- Readiness ---
76
-
77
-
78
  @application.get("/readyz", response_class=PlainTextResponse, tags=["system"])
79
  def readyz() -> str:
80
  mode = os.getenv("DB_MODE", "sqlite").lower()
@@ -82,19 +88,17 @@ def readyz() -> str:
82
  if mode == "postgres":
83
  from adapters.db.postgres_adapter import PostgresAdapter
84
 
85
- dsn = os.environ["POSTGRES_DSN"]
86
- pg = PostgresAdapter(dsn)
87
- ping = getattr(pg, "ping", None)
88
- if callable(ping):
89
- ping()
90
  else:
91
  from adapters.db.sqlite_adapter import SQLiteAdapter
92
 
93
- db_path = os.getenv("SQLITE_DB_PATH", "data/chinook.db")
94
- sq = SQLiteAdapter(db_path)
95
- ping = getattr(sq, "ping", None)
96
- if callable(ping):
97
- ping()
98
  return "ready"
99
  except Exception:
100
  raise HTTPException(status_code=503, detail="not ready")
@@ -116,6 +120,26 @@ def metrics():
116
  return Response(content=data, media_type=CONTENT_TYPE_LATEST)
117
 
118
 
119
- # Backward compatibility for tests & uvicorn targets
120
- app: FastAPI = application
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  __all__ = ["application", "app"]
 
1
  import os
2
  import time
3
  from fastapi import FastAPI, Request, Response, HTTPException
4
+ from fastapi.responses import PlainTextResponse, RedirectResponse
5
+ from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
6
+ from nl2sql.prom import REGISTRY
7
+ from app.routers import dev
 
 
 
 
8
 
9
  try:
10
  from dotenv import load_dotenv
 
15
 
16
  from app.routers import nl2sql
17
 
18
+ # ---- Optional DB map restore ----
19
  try:
20
  from app.routers.nl2sql import _load_db_map
21
 
 
23
  except Exception as e:
24
  print(f"⚠️ DB map not restored: {e}")
25
 
26
+ # ----------------------------------------------------------------------------
27
+ # App definition
28
+ # ----------------------------------------------------------------------------
29
+ application = FastAPI(
30
  title="NL2SQL Copilot Prototype",
31
  version=os.getenv("APP_VERSION", "0.1.0"),
32
  description="Convert natural language to safe & verified SQL",
33
  )
34
 
35
+ # Register only versioned API
36
+ application.include_router(nl2sql.router, prefix="/api/v1")
37
 
38
+ # Register Dev-only routes (only when APP_ENV=dev)
+ # NOTE(review): APP_ENV falls back to "dev" here, so the dev router is also
+ # registered when the variable is unset — confirm deployments set APP_ENV.
39
+ if os.getenv("APP_ENV", "dev").lower() == "dev":
40
+ application.include_router(dev.router, prefix="/api/v1")
41
+ # ----------------------------------------------------------------------------
42
+ # Prometheus Metrics Middleware
43
+ # ----------------------------------------------------------------------------
44
  REQUEST_COUNT = Counter(
45
  "http_requests_total",
46
  "Total HTTP requests",
 
49
  )
50
  REQUEST_LATENCY = Histogram(
51
  "http_request_latency_seconds",
52
+ "Request latency (seconds)",
53
  ["path", "method"],
54
  registry=REGISTRY,
55
  )
 
61
  response: Response = await call_next(request)
62
  elapsed = time.perf_counter() - start
63
  route = request.scope.get("route")
64
+ path = getattr(route, "path", None) or request.url.path
65
+ name = getattr(route, "name", None) or path
66
+
67
  REQUEST_COUNT.labels(
68
+ path=name,
69
+ method=request.method,
70
+ status_code=str(getattr(response, "status_code", 500)),
71
  ).inc()
72
+ REQUEST_LATENCY.labels(path=name, method=request.method).observe(elapsed)
73
  return response
74
 
75
 
76
+ # ----------------------------------------------------------------------------
77
+ # System Endpoints
78
+ # ----------------------------------------------------------------------------
79
  @application.get("/healthz", response_class=PlainTextResponse, tags=["system"])
80
  def healthz() -> str:
81
  return "ok"
82
 
83
 
 
 
 
84
  @application.get("/readyz", response_class=PlainTextResponse, tags=["system"])
85
  def readyz() -> str:
86
  mode = os.getenv("DB_MODE", "sqlite").lower()
 
88
  if mode == "postgres":
89
  from adapters.db.postgres_adapter import PostgresAdapter
90
 
91
+ pg = PostgresAdapter(os.environ["POSTGRES_DSN"])
92
+ ping_fn = getattr(pg, "ping", None)
93
+ if callable(ping_fn):
94
+ ping_fn()
 
95
  else:
96
  from adapters.db.sqlite_adapter import SQLiteAdapter
97
 
98
+ sq = SQLiteAdapter(os.getenv("SQLITE_DB_PATH", "data/chinook.db"))
99
+ ping_fn = getattr(sq, "ping", None)
100
+ if callable(ping_fn):
101
+ ping_fn()
 
102
  return "ready"
103
  except Exception:
104
  raise HTTPException(status_code=503, detail="not ready")
 
120
  return Response(content=data, media_type=CONTENT_TYPE_LATEST)
121
 
122
 
123
+ # ----------------------------------------------------------------------------
124
+ # Legacy Redirects (clean compatibility)
125
+ # ----------------------------------------------------------------------------
126
@application.api_route("/nl2sql", methods=["GET", "POST"])
async def legacy_nl2sql_redirect(request: Request):
    """Redirect the legacy root-level ``/nl2sql`` endpoint to ``/api/v1/nl2sql``.

    Responds with 307 so the client repeats the request with the same method
    and body. The incoming query string is carried over to the target URL;
    without this, ``/nl2sql?x=1`` would lose its parameters on redirect.

    Args:
        request: The incoming request; only its URL query string is read.

    Returns:
        A ``RedirectResponse`` pointing at the versioned endpoint.
    """
    target = "/api/v1/nl2sql"
    query = request.url.query
    if query:
        target = f"{target}?{query}"
    return RedirectResponse(url=target, status_code=307)
129
+
130
+
131
@application.api_route(
    "/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"]
)
async def legacy_catch_all(request: Request, path: str):
    """Redirect old root-level endpoints to the versioned API.

    This handler matches any path, so it also receives requests for paths
    that already start with ``api/v1`` but matched no other route.
    Redirecting such a request to itself makes the client loop on 307
    responses, so those paths are answered with 404 instead.

    Args:
        request: The incoming request; only its URL query string is read.
        path: The full request path without the leading slash.

    Returns:
        A 307 ``RedirectResponse`` to ``/api/v1/{path}``, preserving the
        original query string.

    Raises:
        HTTPException: 404 when ``path`` is already under ``api/v1``.
    """
    if path.startswith("api/v1"):
        # Already versioned and unmatched: a redirect would point at itself.
        raise HTTPException(status_code=404, detail="Not Found")

    target = f"/api/v1/{path}"
    query = request.url.query
    if query:
        target = f"{target}?{query}"
    return RedirectResponse(url=target, status_code=307)
139
+
140
+
141
+ # ----------------------------------------------------------------------------
142
+ # Backward-compatible alias for uvicorn
143
+ # ----------------------------------------------------------------------------
144
+ app = application
145
  __all__ = ["application", "app"]
app/routers/dev.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from pydantic import BaseModel
3
+ from nl2sql.safety import Safety
4
+ from nl2sql.verifier import Verifier
5
+
6
+ # pick adapter for verifier (SQLite default)
7
+ from adapters.db.sqlite_adapter import SQLiteAdapter
8
+
9
+ from dataclasses import is_dataclass, asdict
10
+ from typing import Any
11
+
12
+
13
def _is_dataclass_instance(x: Any) -> bool:
    """Return True only for dataclass *instances* (not dataclass classes)."""
    return is_dataclass(x) and not isinstance(x, type)


def _to_dict(obj: Any) -> dict:
    """Convert a stage result of unknown concrete type into a plain dict.

    Conversion strategies are tried in this order:

    1. ``dict`` input: returned as a shallow copy.
    2. A callable ``model_dump`` (Pydantic v2) or ``dict`` (Pydantic v1)
       method. The attribute must be callable; an object that merely has a
       data attribute called ``dict`` falls through to the later strategies
       instead of raising ``TypeError``.
    3. Dataclass instance: ``dataclasses.asdict``.
    4. Any object with ``__dict__``: its public (non-underscore) attributes.
    5. Anything else: ``{"value": str(obj)}``.

    Args:
        obj: The object to convert.

    Returns:
        A dict representation of ``obj``.
    """
    if isinstance(obj, dict):
        return dict(obj)

    # Pydantic v2 first, then v1.
    for method_name in ("model_dump", "dict"):
        method = getattr(obj, method_name, None)
        if callable(method):
            return method()  # type: ignore[no-any-return]

    if _is_dataclass_instance(obj):
        return asdict(obj)  # type: ignore[arg-type]

    if hasattr(obj, "__dict__"):
        return {k: v for k, v in vars(obj).items() if not k.startswith("_")}

    return {"value": str(obj)}
32
+
33
+
34
+ router = APIRouter(prefix="/_dev", tags=["dev"])
35
+
36
+
37
+ class SQLBody(BaseModel):
38
+ sql: str
39
+
40
+
41
@router.post("/safety")
def dev_safety_check(body: SQLBody):
    """Feed the raw SQL from the request body straight into the Safety stage.

    Intended for validating the Prometheus counters of the safety stage.
    The stage result is returned converted to a plain dict.
    """
    return _to_dict(Safety().check(body.sql))
50
+
51
+
52
@router.post("/verifier")
def dev_verifier_check(body: SQLBody):
    """Run the Verifier stage directly on a raw SQL string with a real adapter.

    The SQLite database path is read from ``SQLITE_DB_PATH`` and falls back
    to ``data/chinook.db``, the same lookup the readiness probe uses, so both
    endpoints talk to the same database.

    Args:
        body: Request body carrying the SQL text to verify.

    Returns:
        The verifier's stage result converted to a plain dict.

    Raises:
        HTTPException: 500 when the SQLite adapter cannot be constructed.
    """
    import os  # local import: this module's header does not import os

    db_path = os.getenv("SQLITE_DB_PATH", "data/chinook.db")
    try:
        adapter = SQLiteAdapter(db_path)
    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(
            status_code=500, detail=f"Adapter init failed: {e}"
        ) from e

    result = Verifier().verify(body.sql, adapter=adapter)
    return _to_dict(result)
nl2sql/metrics.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from prometheus_client import Counter, Histogram
2
+ from nl2sql.prom import REGISTRY
3
+
4
+
5
+ # -----------------------------------------------------------------------------
6
+ # Stage-level metrics
7
+ # -----------------------------------------------------------------------------
8
+ stage_duration_ms = Histogram(
9
+ "stage_duration_ms",
10
+ "Duration (ms) of each pipeline stage",
11
+ ["stage"], # e.g. detector|planner|generator|safety|verifier
12
+ buckets=(1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000),
13
+ registry=REGISTRY,
14
+ )
15
+
16
+ # -----------------------------------------------------------------------------
17
+ # Safety stage metrics
18
+ # -----------------------------------------------------------------------------
19
+ safety_blocks_total = Counter(
20
+ "safety_blocks_total",
21
+ "Count of blocked SQL queries by safety checks",
22
+ [
23
+ "reason"
24
+ ], # e.g. empty_sql, sql_too_long, multiple_statements, forbidden_keyword, parse_error, explain_not_allowed, non_select
25
+ registry=REGISTRY,
26
+ )
27
+
28
+ safety_checks_total = Counter(
29
+ "safety_checks_total",
30
+ "Total SQL queries checked by safety",
31
+ ["ok"], # "true" or "false"
32
+ registry=REGISTRY,
33
+ )
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # Verifier stage metrics
37
+ # -----------------------------------------------------------------------------
38
+ verifier_checks_total = Counter(
39
+ "verifier_checks_total",
40
+ "Count of verifier checks (success/failure)",
41
+ ["ok"], # "true" | "false"
42
+ registry=REGISTRY,
43
+ )
44
+
45
+ verifier_failures_total = Counter(
46
+ "verifier_failures_total",
47
+ "Count of verifier failures by type",
48
+ ["reason"], # e.g. parse_error, semantic_error, agg_without_group_by, preview_exec_error
49
+ registry=REGISTRY,
50
+ )
51
+
52
+ # -----------------------------------------------------------------------------
53
+ # Pipeline-level metrics
54
+ # -----------------------------------------------------------------------------
55
+ pipeline_runs_total = Counter(
56
+ "pipeline_runs_total",
57
+ "Total number of full pipeline runs",
58
+ ["status"], # ok | error | ambiguous
59
+ registry=REGISTRY,
60
+ )
nl2sql/prom.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from prometheus_client import CollectorRegistry
2
+
3
+ REGISTRY = CollectorRegistry()
nl2sql/safety.py CHANGED
@@ -7,6 +7,8 @@ from typing import List, Pattern
7
  import sqlglot
8
 
9
  from nl2sql.types import StageResult, StageTrace
 
 
10
 
11
  # ------------------------- Zero-width & basic regexes -------------------------
12
 
@@ -166,12 +168,16 @@ class Safety:
166
 
167
  # 0) nil / size guard
168
  if not sql or not sql.strip():
 
 
169
  return StageResult(
170
  ok=False,
171
  error=["empty_sql"],
172
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
173
  )
174
  if len(sql) > _MAX_SQL_LEN:
 
 
175
  return StageResult(
176
  ok=False,
177
  error=["sql_too_long"],
@@ -185,6 +191,8 @@ class Safety:
185
  semicolon_count = _count_statements_semicolon(body)
186
  glot_count = _count_statements_sqlglot(body)
187
  if semicolon_count != 1 or glot_count != 1:
 
 
188
  return StageResult(
189
  ok=False,
190
  error=["Multiple statements detected"],
@@ -203,6 +211,8 @@ class Safety:
203
  m = _FORBIDDEN.search(scan_body)
204
  if m:
205
  tok = m.group(0).strip().lower()
 
 
206
  return StageResult(
207
  ok=False,
208
  error=[f"Forbidden: {tok}"],
@@ -212,6 +222,8 @@ class Safety:
212
  m2 = rx.search(scan_body)
213
  if m2:
214
  tok = m2.group(0).strip().lower()
 
 
215
  return StageResult(
216
  ok=False,
217
  error=[f"Forbidden: {tok}"],
@@ -223,6 +235,8 @@ class Safety:
223
  trees = sqlglot.parse(body)
224
  root = trees[0]
225
  except Exception as e:
 
 
226
  return StageResult(
227
  ok=False,
228
  error=["parse_error"],
@@ -241,6 +255,8 @@ class Safety:
241
  t2 = sqlglot.parse_one(remainder)
242
  t2_type = type(t2).__name__.lower() if t2 else ""
243
  if t2_type in {"select", "with"}:
 
 
244
  return StageResult(
245
  ok=True,
246
  data={
@@ -259,6 +275,8 @@ class Safety:
259
  is_explain = root_type == "explain"
260
 
261
  if is_explain and not self.allow_explain:
 
 
262
  return StageResult(
263
  ok=False,
264
  error=["EXPLAIN not allowed"],
@@ -266,6 +284,8 @@ class Safety:
266
  )
267
 
268
  if not (is_select_like or (is_explain and self.allow_explain)):
 
 
269
  return StageResult(
270
  ok=False,
271
  error=[f"Non-SELECT statement: {root_type}"],
@@ -273,6 +293,8 @@ class Safety:
273
  )
274
 
275
  # 5) success
 
 
276
  return StageResult(
277
  ok=True,
278
  data={
 
7
  import sqlglot
8
 
9
  from nl2sql.types import StageResult, StageTrace
10
+ from nl2sql.metrics import safety_blocks_total, stage_duration_ms, safety_checks_total
11
+
12
 
13
  # ------------------------- Zero-width & basic regexes -------------------------
14
 
 
168
 
169
  # 0) nil / size guard
170
  if not sql or not sql.strip():
171
+ safety_blocks_total.labels(reason="empty_sql").inc()
172
+ safety_checks_total.labels(ok="false").inc()
173
  return StageResult(
174
  ok=False,
175
  error=["empty_sql"],
176
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
177
  )
178
  if len(sql) > _MAX_SQL_LEN:
179
+ safety_blocks_total.labels(reason="sql_too_long").inc()
180
+ safety_checks_total.labels(ok="false").inc()
181
  return StageResult(
182
  ok=False,
183
  error=["sql_too_long"],
 
191
  semicolon_count = _count_statements_semicolon(body)
192
  glot_count = _count_statements_sqlglot(body)
193
  if semicolon_count != 1 or glot_count != 1:
194
+ safety_blocks_total.labels(reason="multiple_statements").inc()
195
+ safety_checks_total.labels(ok="false").inc()
196
  return StageResult(
197
  ok=False,
198
  error=["Multiple statements detected"],
 
211
  m = _FORBIDDEN.search(scan_body)
212
  if m:
213
  tok = m.group(0).strip().lower()
214
+ safety_blocks_total.labels(reason="forbidden_keyword").inc()
215
+ safety_checks_total.labels(ok="false").inc()
216
  return StageResult(
217
  ok=False,
218
  error=[f"Forbidden: {tok}"],
 
222
  m2 = rx.search(scan_body)
223
  if m2:
224
  tok = m2.group(0).strip().lower()
225
+ safety_blocks_total.labels(reason="forbidden_keyword").inc()
226
+ safety_checks_total.labels(ok="false").inc()
227
  return StageResult(
228
  ok=False,
229
  error=[f"Forbidden: {tok}"],
 
235
  trees = sqlglot.parse(body)
236
  root = trees[0]
237
  except Exception as e:
238
+ safety_blocks_total.labels(reason="parse_error").inc()
239
+ safety_checks_total.labels(ok="false").inc()
240
  return StageResult(
241
  ok=False,
242
  error=["parse_error"],
 
255
  t2 = sqlglot.parse_one(remainder)
256
  t2_type = type(t2).__name__.lower() if t2 else ""
257
  if t2_type in {"select", "with"}:
258
+ stage_duration_ms.labels("safety").observe(_ms(t0) / 1.0)
259
+ safety_checks_total.labels(ok="true").inc()
260
  return StageResult(
261
  ok=True,
262
  data={
 
275
  is_explain = root_type == "explain"
276
 
277
  if is_explain and not self.allow_explain:
278
+ safety_blocks_total.labels(reason="explain_not_allowed").inc()
279
+ safety_checks_total.labels(ok="false").inc()
280
  return StageResult(
281
  ok=False,
282
  error=["EXPLAIN not allowed"],
 
284
  )
285
 
286
  if not (is_select_like or (is_explain and self.allow_explain)):
287
+ safety_blocks_total.labels(reason="non_select").inc()
288
+ safety_checks_total.labels(ok="false").inc()
289
  return StageResult(
290
  ok=False,
291
  error=[f"Non-SELECT statement: {root_type}"],
 
293
  )
294
 
295
  # 5) success
296
+ stage_duration_ms.labels("safety").observe(_ms(t0) / 1.0)
297
+ safety_checks_total.labels(ok="true").inc()
298
  return StageResult(
299
  ok=True,
300
  data={
nl2sql/verifier.py CHANGED
@@ -8,6 +8,11 @@ import sqlglot
8
  from sqlglot import expressions as exp
9
 
10
  from nl2sql.types import StageResult, StageTrace
 
 
 
 
 
11
 
12
 
13
  def _ms(t0: float) -> int:
@@ -182,6 +187,8 @@ class Verifier:
182
  # Check for common sqlglot error indicators
183
  # When sqlglot can't parse properly, it often creates Command or Unknown nodes
184
  if tree_type in ("Command", "Unknown"):
 
 
185
  return StageResult(
186
  ok=False,
187
  error=["parse_error"],
@@ -190,6 +197,8 @@ class Verifier:
190
 
191
  # Also check if the tree has errors attribute (some versions of sqlglot)
192
  if hasattr(tree, "errors") and tree.errors:
 
 
193
  return StageResult(
194
  ok=False,
195
  error=["parse_error"],
@@ -207,6 +216,8 @@ class Verifier:
207
  for kw in ["selct", "slect", "selet", "seelct"]
208
  ):
209
  # Common misspellings of SELECT
 
 
210
  return StageResult(
211
  ok=False,
212
  error=["parse_error"],
@@ -214,6 +225,8 @@ class Verifier:
214
  )
215
 
216
  except Exception:
 
 
217
  return StageResult(
218
  ok=False,
219
  error=["parse_error"],
@@ -248,9 +261,11 @@ class Verifier:
248
  and any_nonagg_col
249
  and not (has_group or has_window or is_distinct)
250
  ):
 
251
  issues.append("aggregation_without_group_by")
252
  except Exception as e:
253
  # Don't crash the verifier; surface a soft issue and let fallback run
 
254
  issues.append(f"semantic_check_error:{e!s}")
255
 
256
  # 3) Fallback textual scan — only if AST didn't already flag
@@ -276,6 +291,9 @@ class Verifier:
276
  select_list = m_sel.group("sel")
277
  # a comma strongly suggests mixing aggregate and non-aggregate in projection
278
  if "," in select_list:
 
 
 
279
  issues.append("aggregation_without_group_by")
280
  except Exception:
281
  # ignore fallback errors
@@ -287,12 +305,16 @@ class Verifier:
287
  ok_val = self._extract_ok(exec_result)
288
  if ok_val is False:
289
  err = self._extract_error(exec_result)
 
290
  issues.append(f"exec_error:{err}" if err else "exec_error")
291
  except Exception as e:
 
292
  issues.append(f"exec_exception:{e!s}")
293
 
294
  # 5) Final decision — AFTER all checks (note: no early return before fallback)
295
  if issues:
 
 
296
  return StageResult(
297
  ok=False,
298
  error=issues,
@@ -301,6 +323,8 @@ class Verifier:
301
  ),
302
  )
303
 
 
 
304
  return StageResult(
305
  ok=True,
306
  data={"verified": True},
 
8
  from sqlglot import expressions as exp
9
 
10
  from nl2sql.types import StageResult, StageTrace
11
+ from nl2sql.metrics import (
12
+ verifier_checks_total,
13
+ stage_duration_ms,
14
+ verifier_failures_total,
15
+ )
16
 
17
 
18
  def _ms(t0: float) -> int:
 
187
  # Check for common sqlglot error indicators
188
  # When sqlglot can't parse properly, it often creates Command or Unknown nodes
189
  if tree_type in ("Command", "Unknown"):
190
+ verifier_checks_total.labels(ok="false").inc()
191
+ verifier_failures_total.labels(reason="parse_error").inc()
192
  return StageResult(
193
  ok=False,
194
  error=["parse_error"],
 
197
 
198
  # Also check if the tree has errors attribute (some versions of sqlglot)
199
  if hasattr(tree, "errors") and tree.errors:
200
+ verifier_checks_total.labels(ok="false").inc()
201
+ verifier_failures_total.labels(reason="parse_error").inc()
202
  return StageResult(
203
  ok=False,
204
  error=["parse_error"],
 
216
  for kw in ["selct", "slect", "selet", "seelct"]
217
  ):
218
  # Common misspellings of SELECT
219
+ verifier_checks_total.labels(ok="false").inc()
220
+ verifier_failures_total.labels(reason="parse_error").inc()
221
  return StageResult(
222
  ok=False,
223
  error=["parse_error"],
 
225
  )
226
 
227
  except Exception:
228
+ verifier_checks_total.labels(ok="false").inc()
229
+ verifier_failures_total.labels(reason="parse_error").inc()
230
  return StageResult(
231
  ok=False,
232
  error=["parse_error"],
 
261
  and any_nonagg_col
262
  and not (has_group or has_window or is_distinct)
263
  ):
264
+ verifier_failures_total.labels(reason="semantic_error").inc()
265
  issues.append("aggregation_without_group_by")
266
  except Exception as e:
267
  # Don't crash the verifier; surface a soft issue and let fallback run
268
+ verifier_failures_total.labels(reason="semantic_error").inc()
269
  issues.append(f"semantic_check_error:{e!s}")
270
 
271
  # 3) Fallback textual scan — only if AST didn't already flag
 
291
  select_list = m_sel.group("sel")
292
  # a comma strongly suggests mixing aggregate and non-aggregate in projection
293
  if "," in select_list:
294
+ verifier_failures_total.labels(
295
+ reason="agg_without_group_by"
296
+ ).inc()
297
  issues.append("aggregation_without_group_by")
298
  except Exception:
299
  # ignore fallback errors
 
305
  ok_val = self._extract_ok(exec_result)
306
  if ok_val is False:
307
  err = self._extract_error(exec_result)
308
+ verifier_failures_total.labels(reason="preview_exec_error").inc()
309
  issues.append(f"exec_error:{err}" if err else "exec_error")
310
  except Exception as e:
311
+ verifier_failures_total.labels(reason="preview_exec_error").inc()
312
  issues.append(f"exec_exception:{e!s}")
313
 
314
  # 5) Final decision — AFTER all checks (note: no early return before fallback)
315
  if issues:
316
+ verifier_checks_total.labels(ok="false").inc()
317
+ stage_duration_ms.labels("verifier").observe(_ms(t0) / 1.0)
318
  return StageResult(
319
  ok=False,
320
  error=issues,
 
323
  ),
324
  )
325
 
326
+ verifier_checks_total.labels(ok="true").inc()
327
+ stage_duration_ms.labels("verifier").observe(_ms(t0) / 1.0)
328
  return StageResult(
329
  ok=True,
330
  data={"verified": True},