Spaces:
Sleeping
Sleeping
github-actions[bot] commited on
Commit ·
562f213
1
Parent(s): c743376
Sync from GitHub main @ d3788163c1b28737c76fe3930fe9f123a0a2d084
Browse files- adapters/metrics/__init__.py +0 -0
- adapters/metrics/base.py +11 -0
- adapters/metrics/noop.py +14 -0
- adapters/metrics/prometheus.py +16 -0
- nl2sql/pipeline.py +27 -24
- nl2sql/pipeline_factory.py +12 -0
adapters/metrics/__init__.py
ADDED
|
File without changes
|
adapters/metrics/base.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Protocol
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Metrics(Protocol):
|
| 7 |
+
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
|
| 8 |
+
|
| 9 |
+
def inc_pipeline_run(self, *, status: str) -> None: ...
|
| 10 |
+
|
| 11 |
+
def inc_repair_attempt(self, *, outcome: str) -> None: ...
|
adapters/metrics/noop.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from adapters.metrics.base import Metrics
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class NoOpMetrics(Metrics):
|
| 7 |
+
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
|
| 8 |
+
return
|
| 9 |
+
|
| 10 |
+
def inc_pipeline_run(self, *, status: str) -> None:
|
| 11 |
+
return
|
| 12 |
+
|
| 13 |
+
def inc_repair_attempt(self, *, outcome: str) -> None:
|
| 14 |
+
return
|
adapters/metrics/prometheus.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from adapters.metrics.base import Metrics
|
| 4 |
+
from nl2sql.metrics import stage_duration_ms, pipeline_runs_total, repair_attempts_total
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class PrometheusMetrics(Metrics):
|
| 8 |
+
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
|
| 9 |
+
stage_duration_ms.labels(stage).observe(dt_ms)
|
| 10 |
+
|
| 11 |
+
def inc_pipeline_run(self, *, status: str) -> None:
|
| 12 |
+
pipeline_runs_total.labels(status=status).inc()
|
| 13 |
+
|
| 14 |
+
def inc_repair_attempt(self, *, outcome: str) -> None:
|
| 15 |
+
# outcome: attempt | success | failed | skipped
|
| 16 |
+
repair_attempts_total.labels(outcome=outcome).inc()
|
nl2sql/pipeline.py
CHANGED
|
@@ -15,7 +15,8 @@ from nl2sql.executor import Executor
|
|
| 15 |
from nl2sql.verifier import Verifier
|
| 16 |
from nl2sql.repair import Repair
|
| 17 |
from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
|
| 18 |
-
from
|
|
|
|
| 19 |
from nl2sql.errors.codes import ErrorCode
|
| 20 |
from nl2sql.context_engineering.render import render_schema_pack
|
| 21 |
from nl2sql.context_engineering.engineer import ContextEngineer
|
|
@@ -58,6 +59,7 @@ class Pipeline:
|
|
| 58 |
verifier: Optional[Verifier] = None,
|
| 59 |
repair: Optional[Repair] = None,
|
| 60 |
context_engineer: ContextEngineer | None = None,
|
|
|
|
| 61 |
):
|
| 62 |
self.detector = detector
|
| 63 |
self.planner = planner
|
|
@@ -69,6 +71,7 @@ class Pipeline:
|
|
| 69 |
# If the verifier explicitly requires verification, enforce it in finalize.
|
| 70 |
self.require_verification = bool(getattr(self.verifier, "required", False))
|
| 71 |
self.context_engineer = context_engineer
|
|
|
|
| 72 |
|
| 73 |
# ---------------------------- helpers ----------------------------
|
| 74 |
@staticmethod
|
|
@@ -203,7 +206,7 @@ class Pipeline:
|
|
| 203 |
r = self._safe_stage(fn, **kwargs)
|
| 204 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 205 |
|
| 206 |
-
|
| 207 |
|
| 208 |
# attach stage trace
|
| 209 |
if getattr(r, "trace", None):
|
|
@@ -224,7 +227,7 @@ class Pipeline:
|
|
| 224 |
# stage failed → check repair availability
|
| 225 |
eligible, reason = self._should_repair(stage_name, r)
|
| 226 |
if not eligible:
|
| 227 |
-
|
| 228 |
# annotate latest stage trace entry
|
| 229 |
if traces and isinstance(traces[-1], dict):
|
| 230 |
notes = traces[-1].get("notes") or {}
|
|
@@ -243,12 +246,12 @@ class Pipeline:
|
|
| 243 |
repair_args = repair_input_builder(r, kwargs)
|
| 244 |
|
| 245 |
# --- 3) Run repair (always logged) ---
|
| 246 |
-
|
| 247 |
t1 = time.perf_counter()
|
| 248 |
r_fix = self._safe_stage(self.repair.run, **repair_args)
|
| 249 |
dt_fix = (time.perf_counter() - t1) * 1000.0
|
| 250 |
|
| 251 |
-
|
| 252 |
|
| 253 |
if getattr(r_fix, "trace", None):
|
| 254 |
traces.append(r_fix.trace.__dict__)
|
|
@@ -263,7 +266,7 @@ class Pipeline:
|
|
| 263 |
)
|
| 264 |
|
| 265 |
if not r_fix.ok:
|
| 266 |
-
|
| 267 |
return r # repair itself failed → stop here
|
| 268 |
|
| 269 |
# --- 4) Only inject SQL if the stage is an SQL-producing stage ---
|
|
@@ -273,10 +276,10 @@ class Pipeline:
|
|
| 273 |
|
| 274 |
# important: success metric must reflect if repair was applied meaningfully
|
| 275 |
if stage_name in self.SQL_REPAIR_STAGES:
|
| 276 |
-
|
| 277 |
else:
|
| 278 |
# log-only mode counts as a success-attempt but not semantic success
|
| 279 |
-
|
| 280 |
|
| 281 |
# for SQL stages, we re-run the stage again with modified kwargs
|
| 282 |
# for log-only stages, this simply loops and stage is re-run unchanged
|
|
@@ -383,7 +386,7 @@ class Pipeline:
|
|
| 383 |
questions = self.detector.detect(user_query, schema_preview)
|
| 384 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 385 |
is_amb = bool(questions)
|
| 386 |
-
|
| 387 |
traces.append(
|
| 388 |
self._mk_trace(
|
| 389 |
stage="detector",
|
|
@@ -393,7 +396,7 @@ class Pipeline:
|
|
| 393 |
)
|
| 394 |
)
|
| 395 |
if questions:
|
| 396 |
-
|
| 397 |
return FinalResult(
|
| 398 |
ok=True,
|
| 399 |
ambiguous=True,
|
|
@@ -428,7 +431,7 @@ class Pipeline:
|
|
| 428 |
**planner_kwargs,
|
| 429 |
)
|
| 430 |
if not r_plan.ok:
|
| 431 |
-
|
| 432 |
return FinalResult(
|
| 433 |
ok=False,
|
| 434 |
ambiguous=False,
|
|
@@ -466,7 +469,7 @@ class Pipeline:
|
|
| 466 |
**gen_kwargs,
|
| 467 |
)
|
| 468 |
if not r_gen.ok:
|
| 469 |
-
|
| 470 |
return FinalResult(
|
| 471 |
ok=False,
|
| 472 |
ambiguous=False,
|
|
@@ -512,7 +515,7 @@ class Pipeline:
|
|
| 512 |
|
| 513 |
# Guard: empty SQL
|
| 514 |
if not sql or not str(sql).strip():
|
| 515 |
-
|
| 516 |
traces.append(
|
| 517 |
self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
|
| 518 |
)
|
|
@@ -540,7 +543,7 @@ class Pipeline:
|
|
| 540 |
traces=traces,
|
| 541 |
)
|
| 542 |
if not r_safe.ok:
|
| 543 |
-
|
| 544 |
return FinalResult(
|
| 545 |
ok=False,
|
| 546 |
ambiguous=False,
|
|
@@ -588,7 +591,7 @@ class Pipeline:
|
|
| 588 |
traces=traces,
|
| 589 |
)
|
| 590 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 591 |
-
|
| 592 |
|
| 593 |
# Attach a trace entry if verifier didn't provide one
|
| 594 |
if getattr(r_ver, "trace", None):
|
|
@@ -648,7 +651,7 @@ class Pipeline:
|
|
| 648 |
"schema_preview": schema_for_llm,
|
| 649 |
}
|
| 650 |
|
| 651 |
-
|
| 652 |
r_rep = self.repair.run(**rep_kwargs)
|
| 653 |
|
| 654 |
new_sql = (
|
|
@@ -670,7 +673,7 @@ class Pipeline:
|
|
| 670 |
traces=traces,
|
| 671 |
)
|
| 672 |
if not r_safe2.ok:
|
| 673 |
-
|
| 674 |
return FinalResult(
|
| 675 |
ok=False,
|
| 676 |
ambiguous=False,
|
|
@@ -710,11 +713,11 @@ class Pipeline:
|
|
| 710 |
verified = bool(data2.get("verified") is True)
|
| 711 |
|
| 712 |
if verified:
|
| 713 |
-
|
| 714 |
else:
|
| 715 |
-
|
| 716 |
else:
|
| 717 |
-
|
| 718 |
|
| 719 |
# --- 8) optional soft auto-verify (executor success, no details) --- (executor success, no details) ---
|
| 720 |
if (verified is None or not verified) and not details:
|
|
@@ -753,7 +756,7 @@ class Pipeline:
|
|
| 753 |
else:
|
| 754 |
verified_final = bool(verified)
|
| 755 |
|
| 756 |
-
|
| 757 |
|
| 758 |
traces.append(
|
| 759 |
self._mk_trace(
|
|
@@ -782,12 +785,12 @@ class Pipeline:
|
|
| 782 |
)
|
| 783 |
|
| 784 |
except Exception:
|
| 785 |
-
|
| 786 |
# bubble up to make failures visible in tests and logs
|
| 787 |
raise
|
| 788 |
|
| 789 |
finally:
|
| 790 |
# Always record total latency, even on early return/exception
|
| 791 |
-
|
| 792 |
-
(time.perf_counter() - t_all0) * 1000.0
|
| 793 |
)
|
|
|
|
| 15 |
from nl2sql.verifier import Verifier
|
| 16 |
from nl2sql.repair import Repair
|
| 17 |
from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
|
| 18 |
+
from adapters.metrics.base import Metrics
|
| 19 |
+
from adapters.metrics.noop import NoOpMetrics
|
| 20 |
from nl2sql.errors.codes import ErrorCode
|
| 21 |
from nl2sql.context_engineering.render import render_schema_pack
|
| 22 |
from nl2sql.context_engineering.engineer import ContextEngineer
|
|
|
|
| 59 |
verifier: Optional[Verifier] = None,
|
| 60 |
repair: Optional[Repair] = None,
|
| 61 |
context_engineer: ContextEngineer | None = None,
|
| 62 |
+
metrics: Metrics | None = None,
|
| 63 |
):
|
| 64 |
self.detector = detector
|
| 65 |
self.planner = planner
|
|
|
|
| 71 |
# If the verifier explicitly requires verification, enforce it in finalize.
|
| 72 |
self.require_verification = bool(getattr(self.verifier, "required", False))
|
| 73 |
self.context_engineer = context_engineer
|
| 74 |
+
self.metrics: Metrics = metrics or NoOpMetrics()
|
| 75 |
|
| 76 |
# ---------------------------- helpers ----------------------------
|
| 77 |
@staticmethod
|
|
|
|
| 206 |
r = self._safe_stage(fn, **kwargs)
|
| 207 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 208 |
|
| 209 |
+
self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
|
| 210 |
|
| 211 |
# attach stage trace
|
| 212 |
if getattr(r, "trace", None):
|
|
|
|
| 227 |
# stage failed → check repair availability
|
| 228 |
eligible, reason = self._should_repair(stage_name, r)
|
| 229 |
if not eligible:
|
| 230 |
+
self.metrics.inc_repair_attempt(outcome="skipped")
|
| 231 |
# annotate latest stage trace entry
|
| 232 |
if traces and isinstance(traces[-1], dict):
|
| 233 |
notes = traces[-1].get("notes") or {}
|
|
|
|
| 246 |
repair_args = repair_input_builder(r, kwargs)
|
| 247 |
|
| 248 |
# --- 3) Run repair (always logged) ---
|
| 249 |
+
self.metrics.inc_repair_attempt(outcome="attempt")
|
| 250 |
t1 = time.perf_counter()
|
| 251 |
r_fix = self._safe_stage(self.repair.run, **repair_args)
|
| 252 |
dt_fix = (time.perf_counter() - t1) * 1000.0
|
| 253 |
|
| 254 |
+
self.metrics.observe_stage_duration_ms(stage="repair", dt_ms=dt_fix)
|
| 255 |
|
| 256 |
if getattr(r_fix, "trace", None):
|
| 257 |
traces.append(r_fix.trace.__dict__)
|
|
|
|
| 266 |
)
|
| 267 |
|
| 268 |
if not r_fix.ok:
|
| 269 |
+
self.metrics.inc_repair_attempt(outcome="failed")
|
| 270 |
return r # repair itself failed → stop here
|
| 271 |
|
| 272 |
# --- 4) Only inject SQL if the stage is an SQL-producing stage ---
|
|
|
|
| 276 |
|
| 277 |
# important: success metric must reflect if repair was applied meaningfully
|
| 278 |
if stage_name in self.SQL_REPAIR_STAGES:
|
| 279 |
+
self.metrics.inc_repair_attempt(outcome="success")
|
| 280 |
else:
|
| 281 |
# log-only mode counts as a success-attempt but not semantic success
|
| 282 |
+
self.metrics.inc_repair_attempt(outcome="success")
|
| 283 |
|
| 284 |
# for SQL stages, we re-run the stage again with modified kwargs
|
| 285 |
# for log-only stages, this simply loops and stage is re-run unchanged
|
|
|
|
| 386 |
questions = self.detector.detect(user_query, schema_preview)
|
| 387 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 388 |
is_amb = bool(questions)
|
| 389 |
+
self.metrics.observe_stage_duration_ms(stage="detector", dt_ms=dt)
|
| 390 |
traces.append(
|
| 391 |
self._mk_trace(
|
| 392 |
stage="detector",
|
|
|
|
| 396 |
)
|
| 397 |
)
|
| 398 |
if questions:
|
| 399 |
+
self.metrics.inc_pipeline_run(status="ambiguous")
|
| 400 |
return FinalResult(
|
| 401 |
ok=True,
|
| 402 |
ambiguous=True,
|
|
|
|
| 431 |
**planner_kwargs,
|
| 432 |
)
|
| 433 |
if not r_plan.ok:
|
| 434 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 435 |
return FinalResult(
|
| 436 |
ok=False,
|
| 437 |
ambiguous=False,
|
|
|
|
| 469 |
**gen_kwargs,
|
| 470 |
)
|
| 471 |
if not r_gen.ok:
|
| 472 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 473 |
return FinalResult(
|
| 474 |
ok=False,
|
| 475 |
ambiguous=False,
|
|
|
|
| 515 |
|
| 516 |
# Guard: empty SQL
|
| 517 |
if not sql or not str(sql).strip():
|
| 518 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 519 |
traces.append(
|
| 520 |
self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
|
| 521 |
)
|
|
|
|
| 543 |
traces=traces,
|
| 544 |
)
|
| 545 |
if not r_safe.ok:
|
| 546 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 547 |
return FinalResult(
|
| 548 |
ok=False,
|
| 549 |
ambiguous=False,
|
|
|
|
| 591 |
traces=traces,
|
| 592 |
)
|
| 593 |
dt = (time.perf_counter() - t0) * 1000.0
|
| 594 |
+
self.metrics.observe_stage_duration_ms(stage="verifier", dt_ms=dt)
|
| 595 |
|
| 596 |
# Attach a trace entry if verifier didn't provide one
|
| 597 |
if getattr(r_ver, "trace", None):
|
|
|
|
| 651 |
"schema_preview": schema_for_llm,
|
| 652 |
}
|
| 653 |
|
| 654 |
+
self.metrics.inc_repair_attempt(outcome="attempt")
|
| 655 |
r_rep = self.repair.run(**rep_kwargs)
|
| 656 |
|
| 657 |
new_sql = (
|
|
|
|
| 673 |
traces=traces,
|
| 674 |
)
|
| 675 |
if not r_safe2.ok:
|
| 676 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 677 |
return FinalResult(
|
| 678 |
ok=False,
|
| 679 |
ambiguous=False,
|
|
|
|
| 713 |
verified = bool(data2.get("verified") is True)
|
| 714 |
|
| 715 |
if verified:
|
| 716 |
+
self.metrics.inc_repair_attempt(outcome="success")
|
| 717 |
else:
|
| 718 |
+
self.metrics.inc_repair_attempt(outcome="failed")
|
| 719 |
else:
|
| 720 |
+
self.metrics.inc_repair_attempt(outcome="skipped")
|
| 721 |
|
| 722 |
# --- 8) optional soft auto-verify (executor success, no details) --- (executor success, no details) ---
|
| 723 |
if (verified is None or not verified) and not details:
|
|
|
|
| 756 |
else:
|
| 757 |
verified_final = bool(verified)
|
| 758 |
|
| 759 |
+
self.metrics.inc_pipeline_run(status=("ok" if ok else "error"))
|
| 760 |
|
| 761 |
traces.append(
|
| 762 |
self._mk_trace(
|
|
|
|
| 785 |
)
|
| 786 |
|
| 787 |
except Exception:
|
| 788 |
+
self.metrics.inc_pipeline_run(status="error")
|
| 789 |
# bubble up to make failures visible in tests and logs
|
| 790 |
raise
|
| 791 |
|
| 792 |
finally:
|
| 793 |
# Always record total latency, even on early return/exception
|
| 794 |
+
self.metrics.observe_stage_duration_ms(
|
| 795 |
+
stage="pipeline_total", dt_ms=(time.perf_counter() - t_all0) * 1000.0
|
| 796 |
)
|
nl2sql/pipeline_factory.py
CHANGED
|
@@ -12,6 +12,9 @@ except Exception:
|
|
| 12 |
pass
|
| 13 |
|
| 14 |
from nl2sql.pipeline import Pipeline
|
|
|
|
|
|
|
|
|
|
| 15 |
from nl2sql.registry import (
|
| 16 |
DETECTORS,
|
| 17 |
PLANNERS,
|
|
@@ -67,6 +70,13 @@ def _is_pytest() -> bool:
|
|
| 67 |
return bool(os.getenv("PYTEST_CURRENT_TEST"))
|
| 68 |
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def _tr(
|
| 71 |
stage: str,
|
| 72 |
*,
|
|
@@ -214,6 +224,7 @@ def pipeline_from_config(path: str) -> Pipeline:
|
|
| 214 |
verifier=verifier,
|
| 215 |
repair=repair,
|
| 216 |
context_engineer=context_engineer,
|
|
|
|
| 217 |
)
|
| 218 |
|
| 219 |
|
|
@@ -327,4 +338,5 @@ def pipeline_from_config_with_adapter(path: str, *, adapter: DBAdapter) -> Pipel
|
|
| 327 |
executor=executor,
|
| 328 |
verifier=verifier,
|
| 329 |
repair=repair,
|
|
|
|
| 330 |
)
|
|
|
|
| 12 |
pass
|
| 13 |
|
| 14 |
from nl2sql.pipeline import Pipeline
|
| 15 |
+
from adapters.metrics.base import Metrics
|
| 16 |
+
from adapters.metrics.noop import NoOpMetrics
|
| 17 |
+
from adapters.metrics.prometheus import PrometheusMetrics
|
| 18 |
from nl2sql.registry import (
|
| 19 |
DETECTORS,
|
| 20 |
PLANNERS,
|
|
|
|
| 70 |
return bool(os.getenv("PYTEST_CURRENT_TEST"))
|
| 71 |
|
| 72 |
|
| 73 |
+
def _make_metrics() -> Metrics:
|
| 74 |
+
# Under pytest, keep metrics side-effect free.
|
| 75 |
+
if _is_pytest():
|
| 76 |
+
return NoOpMetrics()
|
| 77 |
+
return PrometheusMetrics()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
def _tr(
|
| 81 |
stage: str,
|
| 82 |
*,
|
|
|
|
| 224 |
verifier=verifier,
|
| 225 |
repair=repair,
|
| 226 |
context_engineer=context_engineer,
|
| 227 |
+
metrics=_make_metrics(),
|
| 228 |
)
|
| 229 |
|
| 230 |
|
|
|
|
| 338 |
executor=executor,
|
| 339 |
verifier=verifier,
|
| 340 |
repair=repair,
|
| 341 |
+
metrics=_make_metrics(),
|
| 342 |
)
|