Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

App Files Files Community

github-actions[bot] committed on Dec 21, 2025

Commit

562f213

1 Parent(s): c743376

Sync from GitHub main @ d3788163c1b28737c76fe3930fe9f123a0a2d084

Browse files

Files changed (6) hide show

adapters/metrics/__init__.py +0 -0
adapters/metrics/base.py +11 -0
adapters/metrics/noop.py +14 -0
adapters/metrics/prometheus.py +16 -0
nl2sql/pipeline.py +27 -24
nl2sql/pipeline_factory.py +12 -0

adapters/metrics/__init__.py ADDED Viewed

File without changes

adapters/metrics/base.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from __future__ import annotations
+from typing import Protocol
+class Metrics(Protocol):
+    def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None: ...
+    def inc_pipeline_run(self, *, status: str) -> None: ...
+    def inc_repair_attempt(self, *, outcome: str) -> None: ...

adapters/metrics/noop.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from __future__ import annotations
+from adapters.metrics.base import Metrics
+class NoOpMetrics(Metrics):
+    def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
+        return
+    def inc_pipeline_run(self, *, status: str) -> None:
+        return
+    def inc_repair_attempt(self, *, outcome: str) -> None:
+        return

adapters/metrics/prometheus.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from __future__ import annotations
+from adapters.metrics.base import Metrics
+from nl2sql.metrics import stage_duration_ms, pipeline_runs_total, repair_attempts_total
+class PrometheusMetrics(Metrics):
+    def observe_stage_duration_ms(self, *, stage: str, dt_ms: float) -> None:
+        stage_duration_ms.labels(stage).observe(dt_ms)
+    def inc_pipeline_run(self, *, status: str) -> None:
+        pipeline_runs_total.labels(status=status).inc()
+    def inc_repair_attempt(self, *, outcome: str) -> None:
+        # outcome: attempt | success | failed | skipped
+        repair_attempts_total.labels(outcome=outcome).inc()

nl2sql/pipeline.py CHANGED Viewed

@@ -15,7 +15,8 @@ from nl2sql.executor import Executor
 from nl2sql.verifier import Verifier
 from nl2sql.repair import Repair
 from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
-from nl2sql.metrics import stage_duration_ms, pipeline_runs_total, repair_attempts_total
 from nl2sql.errors.codes import ErrorCode
 from nl2sql.context_engineering.render import render_schema_pack
 from nl2sql.context_engineering.engineer import ContextEngineer
@@ -58,6 +59,7 @@ class Pipeline:
         verifier: Optional[Verifier] = None,
         repair: Optional[Repair] = None,
         context_engineer: ContextEngineer | None = None,
     ):
         self.detector = detector
         self.planner = planner
@@ -69,6 +71,7 @@ class Pipeline:
         # If the verifier explicitly requires verification, enforce it in finalize.
         self.require_verification = bool(getattr(self.verifier, "required", False))
         self.context_engineer = context_engineer
     # ---------------------------- helpers ----------------------------
     @staticmethod
@@ -203,7 +206,7 @@ class Pipeline:
             r = self._safe_stage(fn, **kwargs)
             dt = (time.perf_counter() - t0) * 1000.0
-            stage_duration_ms.labels(stage_name).observe(dt)
             # attach stage trace
             if getattr(r, "trace", None):
@@ -224,7 +227,7 @@ class Pipeline:
             # stage failed → check repair availability
             eligible, reason = self._should_repair(stage_name, r)
             if not eligible:
-                repair_attempts_total.labels(outcome="skipped").inc()
                 # annotate latest stage trace entry
                 if traces and isinstance(traces[-1], dict):
                     notes = traces[-1].get("notes") or {}
@@ -243,12 +246,12 @@ class Pipeline:
             repair_args = repair_input_builder(r, kwargs)
             # --- 3) Run repair (always logged) ---
-            repair_attempts_total.labels(outcome="attempt").inc()
             t1 = time.perf_counter()
             r_fix = self._safe_stage(self.repair.run, **repair_args)
             dt_fix = (time.perf_counter() - t1) * 1000.0
-            stage_duration_ms.labels("repair").observe(dt_fix)
             if getattr(r_fix, "trace", None):
                 traces.append(r_fix.trace.__dict__)
@@ -263,7 +266,7 @@ class Pipeline:
                 )
             if not r_fix.ok:
-                repair_attempts_total.labels(outcome="failed").inc()
                 return r  # repair itself failed → stop here
             # --- 4) Only inject SQL if the stage is an SQL-producing stage ---
@@ -273,10 +276,10 @@ class Pipeline:
             # important: success metric must reflect if repair was applied meaningfully
             if stage_name in self.SQL_REPAIR_STAGES:
-                repair_attempts_total.labels(outcome="success").inc()
             else:
                 # log-only mode counts as a success-attempt but not semantic success
-                repair_attempts_total.labels(outcome="success").inc()
             # for SQL stages, we re-run the stage again with modified kwargs
             # for log-only stages, this simply loops and stage is re-run unchanged
@@ -383,7 +386,7 @@ class Pipeline:
             questions = self.detector.detect(user_query, schema_preview)
             dt = (time.perf_counter() - t0) * 1000.0
             is_amb = bool(questions)
-            stage_duration_ms.labels("detector").observe(dt)
             traces.append(
                 self._mk_trace(
                     stage="detector",
@@ -393,7 +396,7 @@ class Pipeline:
                 )
             )
             if questions:
-                pipeline_runs_total.labels(status="ambiguous").inc()
                 return FinalResult(
                     ok=True,
                     ambiguous=True,
@@ -428,7 +431,7 @@ class Pipeline:
                 **planner_kwargs,
             )
             if not r_plan.ok:
-                pipeline_runs_total.labels(status="error").inc()
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
@@ -466,7 +469,7 @@ class Pipeline:
                 **gen_kwargs,
             )
             if not r_gen.ok:
-                pipeline_runs_total.labels(status="error").inc()
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
@@ -512,7 +515,7 @@ class Pipeline:
             # Guard: empty SQL
             if not sql or not str(sql).strip():
-                pipeline_runs_total.labels(status="error").inc()
                 traces.append(
                     self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
                 )
@@ -540,7 +543,7 @@ class Pipeline:
                 traces=traces,
             )
             if not r_safe.ok:
-                pipeline_runs_total.labels(status="error").inc()
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
@@ -588,7 +591,7 @@ class Pipeline:
                     traces=traces,
                 )
                 dt = (time.perf_counter() - t0) * 1000.0
-                stage_duration_ms.labels("verifier").observe(dt)
                 # Attach a trace entry if verifier didn't provide one
                 if getattr(r_ver, "trace", None):
@@ -648,7 +651,7 @@ class Pipeline:
                             "schema_preview": schema_for_llm,
                         }
-                    repair_attempts_total.labels(outcome="attempt").inc()
                     r_rep = self.repair.run(**rep_kwargs)
                     new_sql = (
@@ -670,7 +673,7 @@ class Pipeline:
                             traces=traces,
                         )
                         if not r_safe2.ok:
-                            pipeline_runs_total.labels(status="error").inc()
                             return FinalResult(
                                 ok=False,
                                 ambiguous=False,
@@ -710,11 +713,11 @@ class Pipeline:
                         verified = bool(data2.get("verified") is True)
                         if verified:
-                            repair_attempts_total.labels(outcome="success").inc()
                         else:
-                            repair_attempts_total.labels(outcome="failed").inc()
                 else:
-                    repair_attempts_total.labels(outcome="skipped").inc()
             # --- 8) optional soft auto-verify (executor success, no details) ---
             if (verified is None or not verified) and not details:
@@ -753,7 +756,7 @@ class Pipeline:
             else:
                 verified_final = bool(verified)
-            pipeline_runs_total.labels(status=("ok" if ok else "error")).inc()
             traces.append(
                 self._mk_trace(
@@ -782,12 +785,12 @@ class Pipeline:
             )
         except Exception:
-            pipeline_runs_total.labels(status="error").inc()
             # bubble up to make failures visible in tests and logs
             raise
         finally:
             # Always record total latency, even on early return/exception
-            stage_duration_ms.labels("pipeline_total").observe(
-                (time.perf_counter() - t_all0) * 1000.0
             )

 from nl2sql.verifier import Verifier
 from nl2sql.repair import Repair
 from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
+from adapters.metrics.base import Metrics
+from adapters.metrics.noop import NoOpMetrics
 from nl2sql.errors.codes import ErrorCode
 from nl2sql.context_engineering.render import render_schema_pack
 from nl2sql.context_engineering.engineer import ContextEngineer
         verifier: Optional[Verifier] = None,
         repair: Optional[Repair] = None,
         context_engineer: ContextEngineer | None = None,
+        metrics: Metrics | None = None,
     ):
         self.detector = detector
         self.planner = planner
         # If the verifier explicitly requires verification, enforce it in finalize.
         self.require_verification = bool(getattr(self.verifier, "required", False))
         self.context_engineer = context_engineer
+        self.metrics: Metrics = metrics or NoOpMetrics()
     # ---------------------------- helpers ----------------------------
     @staticmethod
             r = self._safe_stage(fn, **kwargs)
             dt = (time.perf_counter() - t0) * 1000.0
+            self.metrics.observe_stage_duration_ms(stage=stage_name, dt_ms=dt)
             # attach stage trace
             if getattr(r, "trace", None):
             # stage failed → check repair availability
             eligible, reason = self._should_repair(stage_name, r)
             if not eligible:
+                self.metrics.inc_repair_attempt(outcome="skipped")
                 # annotate latest stage trace entry
                 if traces and isinstance(traces[-1], dict):
                     notes = traces[-1].get("notes") or {}
             repair_args = repair_input_builder(r, kwargs)
             # --- 3) Run repair (always logged) ---
+            self.metrics.inc_repair_attempt(outcome="attempt")
             t1 = time.perf_counter()
             r_fix = self._safe_stage(self.repair.run, **repair_args)
             dt_fix = (time.perf_counter() - t1) * 1000.0
+            self.metrics.observe_stage_duration_ms(stage="repair", dt_ms=dt_fix)
             if getattr(r_fix, "trace", None):
                 traces.append(r_fix.trace.__dict__)
                 )
             if not r_fix.ok:
+                self.metrics.inc_repair_attempt(outcome="failed")
                 return r  # repair itself failed → stop here
             # --- 4) Only inject SQL if the stage is an SQL-producing stage ---
             # important: success metric must reflect if repair was applied meaningfully
             if stage_name in self.SQL_REPAIR_STAGES:
+                self.metrics.inc_repair_attempt(outcome="success")
             else:
                 # log-only mode counts as a success-attempt but not semantic success
+                self.metrics.inc_repair_attempt(outcome="success")
             # for SQL stages, we re-run the stage again with modified kwargs
             # for log-only stages, this simply loops and stage is re-run unchanged
             questions = self.detector.detect(user_query, schema_preview)
             dt = (time.perf_counter() - t0) * 1000.0
             is_amb = bool(questions)
+            self.metrics.observe_stage_duration_ms(stage="detector", dt_ms=dt)
             traces.append(
                 self._mk_trace(
                     stage="detector",
                 )
             )
             if questions:
+                self.metrics.inc_pipeline_run(status="ambiguous")
                 return FinalResult(
                     ok=True,
                     ambiguous=True,
                 **planner_kwargs,
             )
             if not r_plan.ok:
+                self.metrics.inc_pipeline_run(status="error")
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
                 **gen_kwargs,
             )
             if not r_gen.ok:
+                self.metrics.inc_pipeline_run(status="error")
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
             # Guard: empty SQL
             if not sql or not str(sql).strip():
+                self.metrics.inc_pipeline_run(status="error")
                 traces.append(
                     self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
                 )
                 traces=traces,
             )
             if not r_safe.ok:
+                self.metrics.inc_pipeline_run(status="error")
                 return FinalResult(
                     ok=False,
                     ambiguous=False,
                     traces=traces,
                 )
                 dt = (time.perf_counter() - t0) * 1000.0
+                self.metrics.observe_stage_duration_ms(stage="verifier", dt_ms=dt)
                 # Attach a trace entry if verifier didn't provide one
                 if getattr(r_ver, "trace", None):
                             "schema_preview": schema_for_llm,
                         }
+                    self.metrics.inc_repair_attempt(outcome="attempt")
                     r_rep = self.repair.run(**rep_kwargs)
                     new_sql = (
                             traces=traces,
                         )
                         if not r_safe2.ok:
+                            self.metrics.inc_pipeline_run(status="error")
                             return FinalResult(
                                 ok=False,
                                 ambiguous=False,
                         verified = bool(data2.get("verified") is True)
                         if verified:
+                            self.metrics.inc_repair_attempt(outcome="success")
                         else:
+                            self.metrics.inc_repair_attempt(outcome="failed")
                 else:
+                    self.metrics.inc_repair_attempt(outcome="skipped")
             # --- 8) optional soft auto-verify (executor success, no details) ---
             if (verified is None or not verified) and not details:
             else:
                 verified_final = bool(verified)
+            self.metrics.inc_pipeline_run(status=("ok" if ok else "error"))
             traces.append(
                 self._mk_trace(
             )
         except Exception:
+            self.metrics.inc_pipeline_run(status="error")
             # bubble up to make failures visible in tests and logs
             raise
         finally:
             # Always record total latency, even on early return/exception
+            self.metrics.observe_stage_duration_ms(
+                stage="pipeline_total", dt_ms=(time.perf_counter() - t_all0) * 1000.0
             )

nl2sql/pipeline_factory.py CHANGED Viewed

@@ -12,6 +12,9 @@ except Exception:
     pass
 from nl2sql.pipeline import Pipeline
 from nl2sql.registry import (
     DETECTORS,
     PLANNERS,
@@ -67,6 +70,13 @@ def _is_pytest() -> bool:
     return bool(os.getenv("PYTEST_CURRENT_TEST"))
 def _tr(
     stage: str,
     *,
@@ -214,6 +224,7 @@ def pipeline_from_config(path: str) -> Pipeline:
         verifier=verifier,
         repair=repair,
         context_engineer=context_engineer,
     )
@@ -327,4 +338,5 @@ def pipeline_from_config_with_adapter(path: str, *, adapter: DBAdapter) -> Pipel
         executor=executor,
         verifier=verifier,
         repair=repair,
     )

     pass
 from nl2sql.pipeline import Pipeline
+from adapters.metrics.base import Metrics
+from adapters.metrics.noop import NoOpMetrics
+from adapters.metrics.prometheus import PrometheusMetrics
 from nl2sql.registry import (
     DETECTORS,
     PLANNERS,
     return bool(os.getenv("PYTEST_CURRENT_TEST"))
+def _make_metrics() -> Metrics:
+    # Under pytest, keep metrics side-effect free.
+    if _is_pytest():
+        return NoOpMetrics()
+    return PrometheusMetrics()
 def _tr(
     stage: str,
     *,
         verifier=verifier,
         repair=repair,
         context_engineer=context_engineer,
+        metrics=_make_metrics(),
     )
         executor=executor,
         verifier=verifier,
         repair=repair,
+        metrics=_make_metrics(),
     )