Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

App Files Community

github-actions[bot] committed on Dec 21, 2025

Commit

11975fd

1 Parent(s): 562f213

Sync from GitHub main @ 0ffdf76846d3f126cc49a0fd3341046141b13f7d

Browse files

Files changed (5) hide show

adapters/metrics/base.py +18 -3
adapters/metrics/noop.py +11 -2
adapters/metrics/prometheus.py +40 -6
nl2sql/pipeline.py +22 -9
nl2sql/repair.py +7 -3

adapters/metrics/base.py CHANGED Viewed

@@ -1,11 +1,26 @@
 from __future__ import annotations
-from typing import Protocol
-class Metrics(Protocol):
     def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
     def inc_pipeline_run(self, *, status: str) -> None: ...
-    def inc_repair_attempt(self, *, outcome: str) -> None: ...

 from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Literal
+RepairOutcome = Literal["attempt", "success", "failed", "skipped"]
+class Metrics(ABC):
+    @abstractmethod
     def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
+    @abstractmethod
     def inc_pipeline_run(self, *, status: str) -> None: ...
+    @abstractmethod
+    def inc_stage_call(self, *, stage: str, ok: bool) -> None: ...
+    @abstractmethod
+    def inc_stage_error(self, *, stage: str, error_code: str) -> None: ...
+    @abstractmethod
+    def inc_repair_trigger(self, *, stage: str, reason: str) -> None: ...
+    @abstractmethod
+    def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None: ...

adapters/metrics/noop.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-from adapters.metrics.base import Metrics
 class NoOpMetrics(Metrics):
@@ -10,5 +10,14 @@ class NoOpMetrics(Metrics):
     def inc_pipeline_run(self, *, status: str) -> None:
         return
-    def inc_repair_attempt(self, *, outcome: str) -> None:
         return

 from __future__ import annotations
+from adapters.metrics.base import Metrics, RepairOutcome
 class NoOpMetrics(Metrics):
     def inc_pipeline_run(self, *, status: str) -> None:
         return
+    def inc_stage_call(self, *, stage: str, ok: bool) -> None:
+        return
+    def inc_stage_error(self, *, stage: str, error_code: str) -> None:
+        return
+    def inc_repair_trigger(self, *, stage: str, reason: str) -> None:
+        return
+    def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None:
         return

adapters/metrics/prometheus.py CHANGED Viewed

@@ -1,16 +1,50 @@
 from __future__ import annotations
-from adapters.metrics.base import Metrics
-from nl2sql.metrics import stage_duration_ms, pipeline_runs_total, repair_attempts_total
 class PrometheusMetrics(Metrics):
     def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
-        stage_duration_ms.labels(stage).observe(dt_ms)
     def inc_pipeline_run(self, *, status: str) -> None:
         pipeline_runs_total.labels(status=status).inc()
-    def inc_repair_attempt(self, *, outcome: str) -> None:
-        # outcome: attempt | success | failed | skipped
-        repair_attempts_total.labels(outcome=outcome).inc()

 from __future__ import annotations
+from prometheus_client import Counter
+from adapters.metrics.base import Metrics, RepairOutcome
+from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
+stage_calls_total = Counter(
+    "stage_calls_total",
+    "Total number of stage calls by stage and success",
+    ["stage", "ok"],
+)
+stage_errors_total = Counter(
+    "stage_errors_total",
+    "Total number of stage errors by stage and error code",
+    ["stage", "error_code"],
+)
+repair_attempts_total = Counter(
+    "repair_attempts_total",
+    "Total repair attempts by stage and outcome",
+    ["stage", "outcome"],
+)
+repair_trigger_total = Counter(
+    "repair_trigger_total",
+    "Total repair triggers by stage and reason",
+    ["stage", "reason"],
+)
 class PrometheusMetrics(Metrics):
     def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
+        stage_duration_ms.labels(stage=stage).observe(dt_ms)
     def inc_pipeline_run(self, *, status: str) -> None:
         pipeline_runs_total.labels(status=status).inc()
+    def inc_stage_call(self, *, stage: str, ok: bool) -> None:
+        stage_calls_total.labels(stage=stage, ok=str(ok).lower()).inc()
+    def inc_stage_error(self, *, stage: str, error_code: str) -> None:
+        stage_errors_total.labels(stage=stage, error_code=error_code).inc()
+    def inc_repair_trigger(self, *, stage: str, reason: str) -> None:
+        repair_trigger_total.labels(stage=stage, reason=reason).inc()
+    def inc_repair_attempt(self, *, stage: str, outcome: RepairOutcome) -> None:
+        repair_attempts_total.labels(stage=stage, outcome=outcome).inc()

nl2sql/pipeline.py CHANGED Viewed

@@ -208,6 +208,13 @@ class Pipeline:
             self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
             # attach stage trace
             if getattr(r, "trace", None):
                 traces.append(r.trace.__dict__)
@@ -227,7 +234,7 @@ class Pipeline:
             # stage failed → check repair availability
             eligible, reason = self._should_repair(stage_name, r)
             if not eligible:
-                self.metrics.inc_repair_attempt(outcome="skipped")
                 # annotate latest stage trace entry
                 if traces and isinstance(traces[-1], dict):
                     notes = traces[-1].get("notes") or {}
@@ -246,7 +253,8 @@ class Pipeline:
             repair_args = repair_input_builder(r, kwargs)
             # --- 3) Run repair (always logged) ---
-            self.metrics.inc_repair_attempt(outcome="attempt")
             t1 = time.perf_counter()
             r_fix = self._safe_stage(self.repair.run, **repair_args)
             dt_fix = (time.perf_counter() - t1) * 1000.0
@@ -266,7 +274,7 @@ class Pipeline:
                 )
             if not r_fix.ok:
-                self.metrics.inc_repair_attempt(outcome="failed")
                 return r  # repair itself failed → stop here
             # --- 4) Only inject SQL if the stage is an SQL-producing stage ---
@@ -276,10 +284,10 @@ class Pipeline:
             # important: success metric must reflect if repair was applied meaningfully
             if stage_name in self.SQL_REPAIR_STAGES:
-                self.metrics.inc_repair_attempt(outcome="success")
             else:
                 # log-only mode counts as a success-attempt but not semantic success
-                self.metrics.inc_repair_attempt(outcome="success")
             # for SQL stages, we re-run the stage again with modified kwargs
             # for log-only stages, this simply loops and stage is re-run unchanged
@@ -623,6 +631,7 @@ class Pipeline:
                     ),
                 )
                 if eligible:
                     # Prefer the real verifier message if present (tests expect this).
                     err_list = (r_ver.error if (r_ver and r_ver.error) else None) or []
                     error_msg = (
@@ -651,7 +660,7 @@ class Pipeline:
                             "schema_preview": schema_for_llm,
                         }
-                    self.metrics.inc_repair_attempt(outcome="attempt")
                     r_rep = self.repair.run(**rep_kwargs)
                     new_sql = (
@@ -713,11 +722,15 @@ class Pipeline:
                         verified = bool(data2.get("verified") is True)
                         if verified:
-                            self.metrics.inc_repair_attempt(outcome="success")
                         else:
-                            self.metrics.inc_repair_attempt(outcome="failed")
                 else:
-                    self.metrics.inc_repair_attempt(outcome="skipped")
             # --- 8) optional soft auto-verify (executor success, no details) ---
             if (verified is None or not verified) and not details:

             self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
+            self.metrics.inc_stage_call(stage=stage_name, ok=r.ok)
+            if not r.ok and getattr(r, "error_code", None) is not None:
+                self.metrics.inc_stage_error(
+                    stage=stage_name,
+                    error_code=str(r.error_code),
+                )
             # attach stage trace
             if getattr(r, "trace", None):
                 traces.append(r.trace.__dict__)
             # stage failed → check repair availability
             eligible, reason = self._should_repair(stage_name, r)
             if not eligible:
+                self.metrics.inc_repair_attempt(stage="verifier", outcome="skipped")
                 # annotate latest stage trace entry
                 if traces and isinstance(traces[-1], dict):
                     notes = traces[-1].get("notes") or {}
             repair_args = repair_input_builder(r, kwargs)
             # --- 3) Run repair (always logged) ---
+            self.metrics.inc_repair_trigger(stage=stage_name, reason=reason)
+            self.metrics.inc_repair_attempt(stage="verifier", outcome="attempt")
             t1 = time.perf_counter()
             r_fix = self._safe_stage(self.repair.run, **repair_args)
             dt_fix = (time.perf_counter() - t1) * 1000.0
                 )
             if not r_fix.ok:
+                self.metrics.inc_repair_attempt(stage="verifier", outcome="failed")
                 return r  # repair itself failed → stop here
             # --- 4) Only inject SQL if the stage is an SQL-producing stage ---
             # important: success metric must reflect if repair was applied meaningfully
             if stage_name in self.SQL_REPAIR_STAGES:
+                self.metrics.inc_repair_attempt(stage="verifier", outcome="success")
             else:
                 # log-only mode counts as a success-attempt but not semantic success
+                self.metrics.inc_repair_attempt(stage="verifier", outcome="success")
             # for SQL stages, we re-run the stage again with modified kwargs
             # for log-only stages, this simply loops and stage is re-run unchanged
                     ),
                 )
                 if eligible:
+                    self.metrics.inc_repair_trigger(stage="verifier", reason=_reason)
                     # Prefer the real verifier message if present (tests expect this).
                     err_list = (r_ver.error if (r_ver and r_ver.error) else None) or []
                     error_msg = (
                             "schema_preview": schema_for_llm,
                         }
+                    self.metrics.inc_repair_attempt(stage="verifier", outcome="attempt")
                     r_rep = self.repair.run(**rep_kwargs)
                     new_sql = (
                         verified = bool(data2.get("verified") is True)
                         if verified:
+                            self.metrics.inc_repair_attempt(
+                                stage="verifier", outcome="success"
+                            )
                         else:
+                            self.metrics.inc_repair_attempt(
+                                stage="verifier", outcome="failed"
+                            )
                 else:
+                    self.metrics.inc_repair_attempt(stage="verifier", outcome="skipped")
             # --- 8) optional soft auto-verify (executor success, no details) ---
             if (verified is None or not verified) and not details:

nl2sql/repair.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import time
 from nl2sql.types import StageTrace, StageResult
 from adapters.llm.base import LLMProvider
@@ -21,7 +20,7 @@ class Repair:
     def __init__(self, llm: LLMProvider):
         self.llm = llm
-    def run(self, sql: str, error_msg: str, schema_preview: str) -> StageResult:
         t0 = time.perf_counter()
         fixed_sql, t_in, t_out, cost = self.llm.repair(
             sql=sql,
@@ -36,4 +35,9 @@ class Repair:
             cost_usd=cost,
             notes={"old_sql_len": len(sql), "new_sql_len": len(fixed_sql)},
         )
-        return StageResult(ok=True, data={"sql": fixed_sql}, trace=trace)

 import time
 from nl2sql.types import StageTrace, StageResult
 from adapters.llm.base import LLMProvider
     def __init__(self, llm: LLMProvider):
         self.llm = llm
+    def run(self, *, sql: str, error_msg: str, schema_preview: str) -> StageResult:
         t0 = time.perf_counter()
         fixed_sql, t_in, t_out, cost = self.llm.repair(
             sql=sql,
             cost_usd=cost,
             notes={"old_sql_len": len(sql), "new_sql_len": len(fixed_sql)},
         )
+        return StageResult(
+            ok=True,
+            data={"sql": fixed_sql},
+            trace=trace,
+        )