Spaces:
Sleeping
Sleeping
github-actions[bot]
commited on
Commit
·
11975fd
1
Parent(s):
562f213
Sync from GitHub main @ 0ffdf76846d3f126cc49a0fd3341046141b13f7d
Browse files- adapters/metrics/base.py +18 -3
- adapters/metrics/noop.py +11 -2
- adapters/metrics/prometheus.py +40 -6
- nl2sql/pipeline.py +22 -9
- nl2sql/repair.py +7 -3
adapters/metrics/base.py
CHANGED
|
@@ -1,11 +1,26 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
from
|
|
|
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
|
| 8 |
|
|
|
|
| 9 |
def inc_pipeline_run(self, *, status: str) -> None: ...
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import Literal
|
| 5 |
|
| 6 |
+
RepairOutcome = Literal["attempt", "success", "failed", "skipped"]
|
| 7 |
|
| 8 |
+
|
| 9 |
+
class Metrics(ABC):
|
| 10 |
+
@abstractmethod
|
| 11 |
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
|
| 12 |
|
| 13 |
+
@abstractmethod
|
| 14 |
def inc_pipeline_run(self, *, status: str) -> None: ...
|
| 15 |
|
| 16 |
+
@abstractmethod
|
| 17 |
+
def inc_stage_call(self, *, stage: str, ok: bool) -> None: ...
|
| 18 |
+
|
| 19 |
+
@abstractmethod
|
| 20 |
+
def inc_stage_error(self, *, stage: str, error_code: str) -> None: ...
|
| 21 |
+
|
| 22 |
+
@abstractmethod
|
| 23 |
+
def inc_repair_trigger(self, *, stage: str, reason: str) -> None: ...
|
| 24 |
+
|
| 25 |
+
@abstractmethod
|
| 26 |
+
def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None: ...
|
adapters/metrics/noop.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
from adapters.metrics.base import Metrics
|
| 4 |
|
| 5 |
|
| 6 |
class NoOpMetrics(Metrics):
|
|
@@ -10,5 +10,14 @@ class NoOpMetrics(Metrics):
|
|
| 10 |
def inc_pipeline_run(self, *, status: str) -> None:
|
| 11 |
return
|
| 12 |
|
| 13 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
return
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from adapters.metrics.base import Metrics, RepairOutcome
|
| 4 |
|
| 5 |
|
| 6 |
class NoOpMetrics(Metrics):
|
|
|
|
| 10 |
def inc_pipeline_run(self, *, status: str) -> None:
|
| 11 |
return
|
| 12 |
|
| 13 |
+
def inc_stage_call(self, *, stage: str, ok: bool) -> None:
|
| 14 |
+
return
|
| 15 |
+
|
| 16 |
+
def inc_stage_error(self, *, stage: str, error_code: str) -> None:
|
| 17 |
+
return
|
| 18 |
+
|
| 19 |
+
def inc_repair_trigger(self, *, stage: str, reason: str) -> None:
|
| 20 |
+
return
|
| 21 |
+
|
| 22 |
+
def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None:
|
| 23 |
return
|
adapters/metrics/prometheus.py
CHANGED
|
@@ -1,16 +1,50 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
from
|
| 4 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class PrometheusMetrics(Metrics):
|
| 8 |
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
|
| 9 |
-
stage_duration_ms.labels(stage).observe(dt_ms)
|
| 10 |
|
| 11 |
def inc_pipeline_run(self, *, status: str) -> None:
|
| 12 |
pipeline_runs_total.labels(status=status).inc()
|
| 13 |
|
| 14 |
-
def
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from prometheus_client import Counter
|
| 4 |
+
from adapters.metrics.base import Metrics, RepairOutcome
|
| 5 |
+
from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
stage_calls_total = Counter(
|
| 9 |
+
"stage_calls_total",
|
| 10 |
+
"Total number of stage calls by stage and success",
|
| 11 |
+
["stage", "ok"],
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
stage_errors_total = Counter(
|
| 15 |
+
"stage_errors_total",
|
| 16 |
+
"Total number of stage errors by stage and error code",
|
| 17 |
+
["stage", "error_code"],
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
repair_attempts_total = Counter(
|
| 21 |
+
"repair_attempts_total",
|
| 22 |
+
"Total repair attempts by stage and outcome",
|
| 23 |
+
["stage", "outcome"],
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
repair_trigger_total = Counter(
|
| 27 |
+
"repair_trigger_total",
|
| 28 |
+
"Total repair triggers by stage and reason",
|
| 29 |
+
["stage", "reason"],
|
| 30 |
+
)
|
| 31 |
|
| 32 |
|
| 33 |
class PrometheusMetrics(Metrics):
|
| 34 |
def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
|
| 35 |
+
stage_duration_ms.labels(stage=stage).observe(dt_ms)
|
| 36 |
|
| 37 |
def inc_pipeline_run(self, *, status: str) -> None:
|
| 38 |
pipeline_runs_total.labels(status=status).inc()
|
| 39 |
|
| 40 |
+
def inc_stage_call(self, *, stage: str, ok: bool) -> None:
|
| 41 |
+
stage_calls_total.labels(stage=stage, ok=str(ok).lower()).inc()
|
| 42 |
+
|
| 43 |
+
def inc_stage_error(self, *, stage: str, error_code: str) -> None:
|
| 44 |
+
stage_errors_total.labels(stage=stage, error_code=error_code).inc()
|
| 45 |
+
|
| 46 |
+
def inc_repair_trigger(self, *, stage: str, reason: str) -> None:
|
| 47 |
+
repair_trigger_total.labels(stage=stage, reason=reason).inc()
|
| 48 |
+
|
| 49 |
+
def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None:
|
| 50 |
+
repair_attempts_total.labels(stage=stage, outcome=outcome).inc()
|
nl2sql/pipeline.py
CHANGED
|
@@ -208,6 +208,13 @@ class Pipeline:
|
|
| 208 |
|
| 209 |
self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
# attach stage trace
|
| 212 |
if getattr(r, "trace", None):
|
| 213 |
traces.append(r.trace.__dict__)
|
|
@@ -227,7 +234,7 @@ class Pipeline:
|
|
| 227 |
# stage failed → check repair availability
|
| 228 |
eligible, reason = self._should_repair(stage_name, r)
|
| 229 |
if not eligible:
|
| 230 |
-
self.metrics.inc_repair_attempt(outcome="skipped")
|
| 231 |
# annotate latest stage trace entry
|
| 232 |
if traces and isinstance(traces[-1], dict):
|
| 233 |
notes = traces[-1].get("notes") or {}
|
|
@@ -246,7 +253,8 @@ class Pipeline:
|
|
| 246 |
repair_args = repair_input_builder(r, kwargs)
|
| 247 |
|
| 248 |
# --- 3) Run repair (always logged) ---
|
| 249 |
-
self.metrics.
|
|
|
|
| 250 |
t1 = time.perf_counter()
|
| 251 |
r_fix = self._safe_stage(self.repair.run, **repair_args)
|
| 252 |
dt_fix = (time.perf_counter() - t1) * 1000.0
|
|
@@ -266,7 +274,7 @@ class Pipeline:
|
|
| 266 |
)
|
| 267 |
|
| 268 |
if not r_fix.ok:
|
| 269 |
-
self.metrics.inc_repair_attempt(outcome="failed")
|
| 270 |
return r # repair itself failed → stop here
|
| 271 |
|
| 272 |
# --- 4) Only inject SQL if the stage is an SQL-producing stage ---
|
|
@@ -276,10 +284,10 @@ class Pipeline:
|
|
| 276 |
|
| 277 |
# important: success metric must reflect if repair was applied meaningfully
|
| 278 |
if stage_name in self.SQL_REPAIR_STAGES:
|
| 279 |
-
self.metrics.inc_repair_attempt(outcome="success")
|
| 280 |
else:
|
| 281 |
# log-only mode counts as a success-attempt but not semantic success
|
| 282 |
-
self.metrics.inc_repair_attempt(outcome="success")
|
| 283 |
|
| 284 |
# for SQL stages, we re-run the stage again with modified kwargs
|
| 285 |
# for log-only stages, this simply loops and stage is re-run unchanged
|
|
@@ -623,6 +631,7 @@ class Pipeline:
|
|
| 623 |
),
|
| 624 |
)
|
| 625 |
if eligible:
|
|
|
|
| 626 |
# Prefer the real verifier message if present (tests expect this).
|
| 627 |
err_list = (r_ver.error if (r_ver and r_ver.error) else None) or []
|
| 628 |
error_msg = (
|
|
@@ -651,7 +660,7 @@ class Pipeline:
|
|
| 651 |
"schema_preview": schema_for_llm,
|
| 652 |
}
|
| 653 |
|
| 654 |
-
self.metrics.inc_repair_attempt(outcome="attempt")
|
| 655 |
r_rep = self.repair.run(**rep_kwargs)
|
| 656 |
|
| 657 |
new_sql = (
|
|
@@ -713,11 +722,15 @@ class Pipeline:
|
|
| 713 |
verified = bool(data2.get("verified") is True)
|
| 714 |
|
| 715 |
if verified:
|
| 716 |
-
self.metrics.inc_repair_attempt(
|
|
|
|
|
|
|
| 717 |
else:
|
| 718 |
-
self.metrics.inc_repair_attempt(
|
|
|
|
|
|
|
| 719 |
else:
|
| 720 |
-
self.metrics.inc_repair_attempt(outcome="skipped")
|
| 721 |
|
| 722 |
# --- 8) optional soft auto-verify (executor success, no details) --- (executor success, no details) ---
|
| 723 |
if (verified is None or not verified) and not details:
|
|
|
|
| 208 |
|
| 209 |
self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
|
| 210 |
|
| 211 |
+
self.metrics.inc_stage_call(stage=stage_name, ok=r.ok)
|
| 212 |
+
if not r.ok and getattr(r, "error_code", None) is not None:
|
| 213 |
+
self.metrics.inc_stage_error(
|
| 214 |
+
stage=stage_name,
|
| 215 |
+
error_code=str(r.error_code),
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
# attach stage trace
|
| 219 |
if getattr(r, "trace", None):
|
| 220 |
traces.append(r.trace.__dict__)
|
|
|
|
| 234 |
# stage failed → check repair availability
|
| 235 |
eligible, reason = self._should_repair(stage_name, r)
|
| 236 |
if not eligible:
|
| 237 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="skipped")
|
| 238 |
# annotate latest stage trace entry
|
| 239 |
if traces and isinstance(traces[-1], dict):
|
| 240 |
notes = traces[-1].get("notes") or {}
|
|
|
|
| 253 |
repair_args = repair_input_builder(r, kwargs)
|
| 254 |
|
| 255 |
# --- 3) Run repair (always logged) ---
|
| 256 |
+
self.metrics.inc_repair_trigger(stage=stage_name, reason=reason)
|
| 257 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="attempt")
|
| 258 |
t1 = time.perf_counter()
|
| 259 |
r_fix = self._safe_stage(self.repair.run, **repair_args)
|
| 260 |
dt_fix = (time.perf_counter() - t1) * 1000.0
|
|
|
|
| 274 |
)
|
| 275 |
|
| 276 |
if not r_fix.ok:
|
| 277 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="failed")
|
| 278 |
return r # repair itself failed → stop here
|
| 279 |
|
| 280 |
# --- 4) Only inject SQL if the stage is an SQL-producing stage ---
|
|
|
|
| 284 |
|
| 285 |
# important: success metric must reflect if repair was applied meaningfully
|
| 286 |
if stage_name in self.SQL_REPAIR_STAGES:
|
| 287 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="success")
|
| 288 |
else:
|
| 289 |
# log-only mode counts as a success-attempt but not semantic success
|
| 290 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="success")
|
| 291 |
|
| 292 |
# for SQL stages, we re-run the stage again with modified kwargs
|
| 293 |
# for log-only stages, this simply loops and stage is re-run unchanged
|
|
|
|
| 631 |
),
|
| 632 |
)
|
| 633 |
if eligible:
|
| 634 |
+
self.metrics.inc_repair_trigger(stage="verifier", reason=_reason)
|
| 635 |
# Prefer the real verifier message if present (tests expect this).
|
| 636 |
err_list = (r_ver.error if (r_ver and r_ver.error) else None) or []
|
| 637 |
error_msg = (
|
|
|
|
| 660 |
"schema_preview": schema_for_llm,
|
| 661 |
}
|
| 662 |
|
| 663 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="attempt")
|
| 664 |
r_rep = self.repair.run(**rep_kwargs)
|
| 665 |
|
| 666 |
new_sql = (
|
|
|
|
| 722 |
verified = bool(data2.get("verified") is True)
|
| 723 |
|
| 724 |
if verified:
|
| 725 |
+
self.metrics.inc_repair_attempt(
|
| 726 |
+
stage="verifier", outcome="success"
|
| 727 |
+
)
|
| 728 |
else:
|
| 729 |
+
self.metrics.inc_repair_attempt(
|
| 730 |
+
stage="verifier", outcome="failed"
|
| 731 |
+
)
|
| 732 |
else:
|
| 733 |
+
self.metrics.inc_repair_attempt(stage="verifier", outcome="skipped")
|
| 734 |
|
| 735 |
# --- 8) optional soft auto-verify (executor success, no details) --- (executor success, no details) ---
|
| 736 |
if (verified is None or not verified) and not details:
|
nl2sql/repair.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import time
|
| 2 |
|
| 3 |
-
|
| 4 |
from nl2sql.types import StageTrace, StageResult
|
| 5 |
from adapters.llm.base import LLMProvider
|
| 6 |
|
|
@@ -21,7 +20,7 @@ class Repair:
|
|
| 21 |
def __init__(self, llm: LLMProvider):
|
| 22 |
self.llm = llm
|
| 23 |
|
| 24 |
-
def run(self, sql: str, error_msg: str, schema_preview: str) -> StageResult:
|
| 25 |
t0 = time.perf_counter()
|
| 26 |
fixed_sql, t_in, t_out, cost = self.llm.repair(
|
| 27 |
sql=sql,
|
|
@@ -36,4 +35,9 @@ class Repair:
|
|
| 36 |
cost_usd=cost,
|
| 37 |
notes={"old_sql_len": len(sql), "new_sql_len": len(fixed_sql)},
|
| 38 |
)
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import time
|
| 2 |
|
|
|
|
| 3 |
from nl2sql.types import StageTrace, StageResult
|
| 4 |
from adapters.llm.base import LLMProvider
|
| 5 |
|
|
|
|
| 20 |
def __init__(self, llm: LLMProvider):
|
| 21 |
self.llm = llm
|
| 22 |
|
| 23 |
+
def run(self, *, sql: str, error_msg: str, schema_preview: str) -> StageResult:
|
| 24 |
t0 = time.perf_counter()
|
| 25 |
fixed_sql, t_in, t_out, cost = self.llm.repair(
|
| 26 |
sql=sql,
|
|
|
|
| 35 |
cost_usd=cost,
|
| 36 |
notes={"old_sql_len": len(sql), "new_sql_len": len(fixed_sql)},
|
| 37 |
)
|
| 38 |
+
|
| 39 |
+
return StageResult(
|
| 40 |
+
ok=True,
|
| 41 |
+
data={"sql": fixed_sql},
|
| 42 |
+
trace=trace,
|
| 43 |
+
)
|