Melika Kheirieh committed · Commit eee3f75 · Parent: a337fad

fix(types): resolve mypy errors and make pytest pass
Changed files:
- app/schemas.py                 +3 -3
- benchmarks/evaluate_spider.py  +211 -55
- benchmarks/run.py              +34 -10
app/schemas.py
CHANGED
@@ -1,5 +1,5 @@
-from pydantic import BaseModel
-from typing import List, Optional, Any, Dict
+from pydantic import BaseModel, Field
+from typing import List, Optional, Any, Dict, Mapping, Sequence
 
 
 class NL2SQLRequest(BaseModel):
@@ -21,7 +21,7 @@ class NL2SQLResponse(BaseModel):
     ambiguous: bool = False
     sql: Optional[str] = None
     rationale: Optional[str] = None
-    traces: ...
+    traces: Sequence[TraceModel | Mapping[str, Any]] = Field(default_factory=list)
 
 
 class ClarifyResponse(BaseModel):
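Note on the traces change: Sequence[TraceModel | Mapping[str, Any]] lets callers pass typed trace objects or plain dicts, and Field(default_factory=list) avoids a shared mutable default. A minimal self-contained sketch (the stand-in TraceModel below is hypothetical; the real model lives elsewhere in app/schemas.py):

    from typing import Any, Mapping, Optional, Sequence
    from pydantic import BaseModel, Field

    class TraceModel(BaseModel):  # stand-in for the project's real TraceModel
        step: str
        detail: str = ""

    class NL2SQLResponse(BaseModel):
        ambiguous: bool = False
        sql: Optional[str] = None
        rationale: Optional[str] = None
        traces: Sequence[TraceModel | Mapping[str, Any]] = Field(default_factory=list)

    # Typed and untyped traces both validate:
    r = NL2SQLResponse(sql="SELECT 1", traces=[TraceModel(step="plan"), {"step": "generate"}])
    print(len(r.traces))  # 2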
benchmarks/evaluate_spider.py
CHANGED
@@ -1,38 +1,96 @@
 from __future__ import annotations
-
+
 import json
 import subprocess
+import time
 from pathlib import Path
-from ...
+from typing import Any, Iterable, Optional, Tuple, cast
 
-from ...
+from tqdm import tqdm
 from langchain_community.utilities import SQLDatabase
-from benchmarks import load_spider_sqlite
-
 from sqlglot import parse_one, exp
 from sqlglot.errors import ParseError
+from sqlalchemy import create_engine, inspect
+from spider_loader import load_spider_sqlite
+
+
+def _try_import_pipeline():
+    """
+    Try multiple plausible entrypoints from nl2sql.
+    Returns a tuple of callables or None:
+    (make_pipeline | None, run_function | None, PipelineClass | None)
+    """
+    make_pipeline = None
+    run_fn = None
+    PipelineCls = None
+    try:
+        from nl2sql.pipeline import make_pipeline as _mk  # type: ignore
+
+        make_pipeline = _mk
+    except Exception:
+        pass
+    try:
+        from nl2sql.pipeline import run_nl2sql as _run  # type: ignore
+
+        run_fn = _run
+    except Exception:
+        pass
+    try:
+        from nl2sql.pipeline import Pipeline as _P  # type: ignore
+
+        PipelineCls = _P
+    except Exception:
+        pass
+    return make_pipeline, run_fn, PipelineCls
+
 
 LOG_DIR = Path("logs/spider_eval")
 LOG_DIR.mkdir(parents=True, exist_ok=True)
 
+FORBIDDEN_NODES: Tuple[type, ...] = (
+    exp.Insert,
+    exp.Delete,
+    exp.Update,
+    exp.Drop,
+    exp.Alter,
+    exp.Attach,
+    exp.Pragma,
+    exp.Create,
+)
+
 
 def normalize_sql(sql: str) -> str:
-    # simple version; could be made stronger with parse + rebuild
     return " ".join(sql.lower().strip().split())
 
 
-def compare_results(...):
+def compare_results(
+    pred_rows: Optional[Iterable[Any]], gold_rows: Optional[Iterable[Any]]
+) -> bool:
     if pred_rows is None or gold_rows is None:
         return False
-    # if order does not matter
     return set(pred_rows) == set(gold_rows)
 
 
-def try_execute_sql(...):
+def try_execute_sql(
+    sql_db: SQLDatabase,
+    sql: str,
+    timeout: Optional[float] = None,  # kept for API compatibility
+) -> tuple[Optional[list[tuple[Any, ...]]], float, Optional[str]]:
     start = time.time()
     try:
-        ...
+        raw_rows = sql_db.run(sql)
+
+        # Normalize result shape for mypy and downstream code
+        if isinstance(raw_rows, list):
+            rows = [tuple(r) for r in raw_rows]
+        elif isinstance(raw_rows, tuple):
+            rows = [tuple(raw_rows)]
+        else:
+            # Fallback cast — if library returns ResultSet or something similar
+            rows = cast(list[tuple[Any, ...]], raw_rows)
+
         return rows, time.time() - start, None
+
     except Exception as e:
         return None, time.time() - start, str(e)
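The shape normalization inside try_execute_sql is what satisfies mypy here: whatever the driver hands back, downstream code sees list[tuple]. The same idea as a standalone sketch (no langchain required; normalize_rows is a hypothetical helper, not part of the commit):

    from typing import Any, cast

    def normalize_rows(raw_rows: Any) -> list[tuple[Any, ...]]:
        # A list of rows becomes a list of tuples; a single row becomes a one-element list
        if isinstance(raw_rows, list):
            return [tuple(r) for r in raw_rows]
        if isinstance(raw_rows, tuple):
            return [tuple(raw_rows)]
        # Otherwise trust the annotation (e.g. a ResultSet-like object)
        return cast(list[tuple[Any, ...]], raw_rows)

    print(normalize_rows([(1, "a"), (2, "b")]))  # [(1, 'a'), (2, 'b')]
    print(normalize_rows((1, "a")))              # [(1, 'a')]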
|
@@ -44,7 +102,7 @@ def exact_match_structural(sql_pred: str, sql_gold: str) -> bool:
     except Exception:
         return False
 
-def normalize_ast(node: exp.Expression):
+def normalize_ast(node: exp.Expression) -> exp.Expression:
     for name, arg in node.args.items():
         if isinstance(arg, list):
             arg.sort(key=lambda x: str(x))
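normalize_ast sorts every list-valued arg of a node so order-insensitive clauses compare equal; exact_match_structural (its body falls mostly outside this hunk) evidently parses both queries and compares the normalized trees. A sketch under that assumption:

    from sqlglot import parse_one, exp

    def normalize_ast(node: exp.Expression) -> exp.Expression:
        # Sort list args (e.g. the select list) for order-insensitive comparison
        for name, arg in node.args.items():
            if isinstance(arg, list):
                arg.sort(key=lambda x: str(x))
        return node

    def exact_match_structural_sketch(a: str, b: str) -> bool:
        try:
            return normalize_ast(parse_one(a)) == normalize_ast(parse_one(b))
        except Exception:
            return False

    print(exact_match_structural_sketch("SELECT b, a FROM t", "SELECT a, b FROM t"))  # True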
|
@@ -73,19 +131,7 @@ def get_git_commit_hash() -> str:
     return "UNKNOWN"
 
 
-... = (
-    exp.Insert,
-    exp.Delete,
-    exp.Update,
-    exp.Drop,
-    exp.Alter,
-    exp.Attach,
-    exp.Pragma,
-    exp.Create,
-)
-
-
-def is_safe_sql(sql: str, dialect: str | None = None) -> bool:
+def is_safe_sql(sql: str, dialect: Optional[str] = None) -> bool:
     try:
         ast = parse_one(sql, read=dialect)
     except ParseError:
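is_safe_sql allow-lists read-only statements: parse failures are rejected, and the parsed tree is checked against FORBIDDEN_NODES before return True. The lines between the parse and return True are elided in this view, so the AST scan below is an assumption about how the guard works, using sqlglot's find():

    from sqlglot import parse_one, exp
    from sqlglot.errors import ParseError

    FORBIDDEN_NODES = (exp.Insert, exp.Delete, exp.Update, exp.Drop,
                       exp.Alter, exp.Attach, exp.Pragma, exp.Create)

    def is_safe_sql_sketch(sql: str) -> bool:
        try:
            ast = parse_one(sql)
        except ParseError:
            return False
        # Reject if any forbidden node type appears anywhere in the tree
        return ast.find(*FORBIDDEN_NODES) is None

    print(is_safe_sql_sketch("SELECT name FROM users WHERE id = 1"))  # True
    print(is_safe_sql_sketch("DROP TABLE users"))                     # False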
@@ -98,7 +144,104 @@ def is_safe_sql(sql: str, dialect: str | None = None) -> bool:
     return True
 
 
-def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
+# --- replacement for get_schema_preview from app.routers ---
+def get_schema_preview_sqlalchemy(db_path: str, max_cols: int = 0) -> str:
+    """
+    Lightweight schema preview using SQLAlchemy inspector.
+    max_cols=0 => unlimited
+    """
+    engine = create_engine(f"sqlite:///{db_path}")
+    insp = inspect(engine)
+    lines: list[str] = []
+    for tbl in sorted(insp.get_table_names()):
+        cols = insp.get_columns(tbl)
+        if max_cols > 0:
+            cols = cols[:max_cols]
+        col_str = ", ".join(f"{c['name']}:{c.get('type')}" for c in cols)
+        pks = insp.get_pk_constraint(tbl).get("constrained_columns") or []
+        pk_str = f" | PK: {', '.join(pks)}" if pks else ""
+        fks = insp.get_foreign_keys(tbl)
+        fk_str = ""
+        if fks:
+            fks_desc = []
+            for fk in fks:
+                ref = fk.get("referred_table")
+                cols_fk = ", ".join(fk.get("constrained_columns") or [])
+                ref_cols = ", ".join(fk.get("referred_columns") or [])
+                fks_desc.append(f"{cols_fk} -> {ref}({ref_cols})")
+            fk_str = " | FK: " + " ; ".join(fks_desc)
+        lines.append(f"{tbl}({col_str}){pk_str}{fk_str}")
+    engine.dispose()
+    return "\n".join(lines)
+
+
+def _generate_sql(
+    question: str, sql_db: SQLDatabase, schema_text: str, max_output_tokens: int = 1000
+) -> tuple[str, str, dict[str, Any]]:
+    """
+    Returns: (status_msg, sql_text, extra_output)
+    Strategy:
+    1) If nl2sql.pipeline.run_nl2sql exists: call it.
+    2) Else if nl2sql.pipeline.make_pipeline exists: build and run.
+    3) Else if nl2sql.pipeline.Pipeline exists: instantiate minimal pipeline and run.
+    4) Else: raise NotImplementedError.
+    """
+    make_pipeline, run_fn, PipelineCls = _try_import_pipeline()
+
+    # Case 1: direct run function
+    if run_fn is not None:
+        res = run_fn(
+            question=question,
+            schema_text=schema_text,
+            sql_db=sql_db,
+            max_output_tokens=max_output_tokens,
+        )
+        # Expecting a dict-like or object with attributes; normalize:
+        if isinstance(res, dict):
+            msg = res.get("status", "ok")
+            sql = res.get("sql", "")
+            return msg, sql, res
+        # fallback generic
+        msg = getattr(res, "status", "ok")
+        sql = getattr(res, "sql", "")
+        return msg, sql, {"result": res}
+
+    # Case 2: factory + run
+    if make_pipeline is not None:
+        pipe = make_pipeline(sql_db=sql_db, schema_text=schema_text)  # type: ignore[arg-type]
+        # Common conventions:
+        if hasattr(pipe, "run"):
+            out = pipe.run(question)  # type: ignore[call-arg]
+        elif hasattr(pipe, "execute"):
+            out = pipe.execute(question)  # type: ignore[call-arg]
+        else:
+            raise RuntimeError("Pipeline object has no run/execute()")
+        msg = getattr(out, "status", "ok")
+        sql = getattr(out, "sql", "")
+        return msg, sql, {"result": out}
+
+    # Case 3: class-based pipeline
+    if PipelineCls is not None:
+        # Try minimal constructor names; adjust to your class signature if needed.
+        # We pass what we have; extra kwargs should be ignored or have defaults.
+        pipe = PipelineCls(sql_db=sql_db, schema_text=schema_text)
+        if hasattr(pipe, "run"):
+            out = pipe.run(question)  # type: ignore[call-arg]
+        else:
+            raise RuntimeError("Pipeline class has no run()")
+        msg = getattr(out, "status", "ok")
+        sql = getattr(out, "sql", "")
+        return msg, sql, {"result": out}
+
+    raise NotImplementedError(
+        "Cannot locate a public NL2SQL entrypoint in nl2sql.pipeline. "
+        "Expose one of: run_nl2sql(), make_pipeline(), or Pipeline.run()."
+    )
+
+
+def run_eval(
+    split: str = "dev", limit: int = 100, resume: bool = True, sleep_time: float = 0.01
+) -> None:
     data = load_spider_sqlite(split)
     if len(data) < limit:
         limit = len(data)
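What get_schema_preview_sqlalchemy emits, demonstrated on a throwaway SQLite file (the tables here are invented for the demo; exact type strings depend on the SQLAlchemy dialect):

    import os
    import sqlite3
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), "demo.db")
    con = sqlite3.connect(path)
    con.executescript("""
        CREATE TABLE artist (id INTEGER PRIMARY KEY, name TEXT);
        CREATE TABLE album (id INTEGER PRIMARY KEY, title TEXT,
                            artist_id INTEGER REFERENCES artist(id));
    """)
    con.close()

    print(get_schema_preview_sqlalchemy(path, max_cols=0))
    # album(id:INTEGER, title:TEXT, artist_id:INTEGER) | PK: id | FK: artist_id -> artist(id)
    # artist(id:INTEGER, name:TEXT) | PK: id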
@@ -113,7 +256,7 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
     results_fn = LOG_DIR / f"{split}_results_{start_ts}.jsonl"
     metrics_fn = LOG_DIR / f"{split}_metrics_{start_ts}.json"
 
-    done = set()
+    done: set[tuple[str, str]] = set()
     if resume and results_fn.exists():
         with results_fn.open("r", encoding="utf-8") as f:
             for line in f:
@@ -126,6 +269,8 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
                     pass
 
     write_header = not results_fn.exists()
+    agg: list[dict[str, Any]] = []
+
     with (
         results_fn.open("a", encoding="utf-8") as fout,
         pred_txt.open("a", encoding="utf-8") as fpred,
@@ -141,25 +286,48 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
             fout.write("# " + json.dumps(header, ensure_ascii=False) + "\n")
             fout.flush()
 
-        agg = []
         for ex in tqdm(data):
             key = (ex.db_id, ex.question)
             if resume and key in done:
                 continue
 
             db_path = str(ex.db_path)
-            schema = ...
+            schema = get_schema_preview_sqlalchemy(db_path, max_cols=0)
             sql_db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
-            chain = make_sql_chain(sql_db)
-            state = {
-                "db_path": db_path,
-                "sql_db": sql_db,
-                "schema_text": schema,
-                "chain": chain,
-            }
 
             t0 = time.time()
-            ...
+            try:
+                msg, sql, output = _generate_sql(
+                    ex.question, sql_db, schema, max_output_tokens=1000
+                )
+            except NotImplementedError as e:
+                rec = {
+                    "db_id": ex.db_id,
+                    "question": ex.question,
+                    "gold_sql": ex.gold_sql,
+                    "pred_sql": "",
+                    "status": "no_entrypoint",
+                    "output": {"error": str(e)},
+                    "gen_time": time.time() - t0,
+                    "exec_time": None,
+                    "error": "no_entrypoint",
+                    "gold_error": None,
+                    "pred_rows": None,
+                    "gold_rows": None,
+                    "exact_match": False,
+                    "exact_match_structural": False,
+                    "execution_accuracy": False,
+                    "safe_check_failed": True,
+                }
+                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
+                fout.flush()
+                fgold.write(f"{ex.gold_sql}\t{ex.db_id}\n")
+                fgold.flush()
+                agg.append(rec)
+                if sleep_time > 0:
+                    time.sleep(sleep_time)
+                continue
+
             gen_time = time.time() - t0
 
             safe_flag = is_safe_sql(sql)
@@ -197,21 +365,9 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
             gold_rows, gold_time, gold_error = try_execute_sql(sql_db, ex.gold_sql)
 
             skip = gold_error is not None
-
-            em = False
-            if not skip:
-                try:
-                    em = normalize_sql(sql) == normalize_sql(ex.gold_sql)
-                except Exception:
-                    pass
-
-            em_struct = False
-            if not skip:
-                em_struct = exact_match_structural(sql, ex.gold_sql)
-
-            exec_acc = False
-            if not skip:
-                exec_acc = compare_results(pred_rows, gold_rows)
+            em = normalize_sql(sql) == normalize_sql(ex.gold_sql) if not skip else False
+            em_struct = exact_match_structural(sql, ex.gold_sql) if not skip else False
+            exec_acc = compare_results(pred_rows, gold_rows) if not skip else False
 
             rec = {
                 "db_id": ex.db_id,
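The three conditional expressions keep em, em_struct, and exec_acc always-bound plain bools that mypy can verify, instead of the old multi-branch blocks. Their semantics side by side, using only functions defined in this file:

    pred = "SELECT name FROM users  ORDER BY id"
    gold = "select name from users order by id"
    em = normalize_sql(pred) == normalize_sql(gold)   # whitespace/case-insensitive -> True

    pred_rows = [(2, "b"), (1, "a")]
    gold_rows = [(1, "a"), (2, "b")]
    exec_acc = compare_results(pred_rows, gold_rows)  # order-insensitive set compare -> True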
@@ -231,7 +387,6 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
                 "execution_accuracy": exec_acc,
                 "safe_check_failed": False,
             }
-
             fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
             fout.flush()
             fpred.write(f"{sql}\t{ex.db_id}\n")
@@ -246,7 +401,7 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
     valid = [
         r
         for r in agg
-        if (not r.get("safe_check_failed", False)) and r.get("gold_error") is None
+        if (not r.get("safe_check_failed", False)) and (r.get("gold_error") is None)
     ]
     total_valid = len(valid)
     total_all = len(agg)
@@ -263,8 +418,8 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
         if (r.get("error") is not None) and (not r.get("safe_check_failed", False))
     )
     safe_fail_count = sum(1 for r in agg if r.get("safe_check_failed", False))
-    avg_gen_time = sum(r["gen_time"] for r in valid) / total_valid
-    avg_exec_time = sum(r["exec_time"] for r in valid) / total_valid
+    avg_gen_time = sum(float(r["gen_time"]) for r in valid) / total_valid
+    avg_exec_time = sum(float(r["exec_time"]) for r in valid) / total_valid
 
     metrics = {
         "commit_hash": commit_hash,
@@ -282,6 +437,7 @@ def run_eval(split="dev", limit=100, resume=True, sleep_time: float = 0.01):
         "run_id": start_ts,
     }
 
+    metrics_fn = LOG_DIR / f"{split}_metrics_{start_ts}.json"
    with metrics_fn.open("w", encoding="utf-8") as fm:
         json.dump(metrics, fm, ensure_ascii=False, indent=2)
 
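A hypothetical invocation of the rewritten evaluator (assuming the Spider SQLite databases have already been fetched by spider_loader; the flag values are illustrative):

    from benchmarks.evaluate_spider import run_eval

    # Evaluate 50 dev examples; resume=True skips (db_id, question) pairs
    # already recorded in this run's results JSONL under logs/spider_eval/.
    run_eval(split="dev", limit=50, resume=True, sleep_time=0.01)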
benchmarks/run.py
CHANGED
@@ -1,11 +1,11 @@
-# benchmarks/run.py
 from __future__ import annotations
+
 import argparse
 import os
 import json
 import time
 from pathlib import Path
-from typing import Iterable, List, Dict, Any
+from typing import Iterable, List, Dict, Any, Protocol, Tuple, Optional
 
 # ---- app imports
 from nl2sql.pipeline import Pipeline, FinalResult
@@ -22,11 +22,34 @@ from adapters.db.sqlite_adapter import SQLiteAdapter
 from adapters.llm.openai_provider import OpenAIProvider
 
 
-# ---- ...
+# ---- LLM protocol (unifies OpenAIProvider and DummyLLM for mypy)
+class LLMProvider(Protocol):
+    """Minimal interface required by Planner/Generator/Repair stages."""
+
+    provider_id: str
+
+    def plan(self, *, user_query: str, schema_preview: str) -> Tuple[str, int, int, float]:
+        ...
+
+    def generate_sql(
+        self,
+        *,
+        user_query: str,
+        schema_preview: str,
+        plan_text: str,
+        clarify_answers: Optional[Any] = None,
+    ) -> Tuple[str, str, int, int, float]:
+        ...
+
+    def repair(self, *, sql: str, error_msg: str, schema_preview: str) -> Tuple[str, int, int, float]:
+        ...
+
+
+# ---- fallback: Dummy LLM (so it runs without API keys)
 class DummyLLM:
     provider_id = "dummy-llm"
 
-    def plan(self, *, user_query: str, schema_preview: str):
+    def plan(self, *, user_query: str, schema_preview: str) -> Tuple[str, int, int, float]:
         text = (
             f"- understand question: {user_query}\n"
             "- identify tables\n- join if needed\n- filter\n- order/limit"
@@ -39,14 +62,14 @@ class DummyLLM:
         user_query: str,
         schema_preview: str,
         plan_text: str,
-        clarify_answers=None,
-    ):
+        clarify_answers: Optional[Any] = None,
+    ) -> Tuple[str, str, int, int, float]:
         # naive demo SQL (so pipeline flows end-to-end)
         sql = "SELECT 1 AS one;"
         rationale = "Demo SQL from DummyLLM"
         return sql, rationale, 0, 0, 0.0
 
-    def repair(self, *, sql: str, error_msg: str, schema_preview: str):
+    def repair(self, *, sql: str, error_msg: str, schema_preview: str) -> Tuple[str, int, int, float]:
         return sql, 0, 0, 0.0
 
 
@@ -73,11 +96,12 @@ def build_pipeline(db_path: Path, use_openai: bool) -> Pipeline:
     db = SQLiteAdapter(str(db_path))
     executor = Executor(db)
 
-    # LLM provider
+    # LLM provider (typed to the Protocol so mypy accepts either provider)
+    llm: LLMProvider
     if use_openai and os.getenv("OPENAI_API_KEY"):
-        llm = OpenAIProvider()
+        llm = OpenAIProvider()  # conforms to LLMProvider
     else:
-        llm = DummyLLM()
+        llm = DummyLLM()  # conforms to LLMProvider
 
     # stages
     detector = AmbiguityDetector()
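Why the Protocol fixes the mypy failure: without the llm: LLMProvider declaration, mypy infers llm's type from the first branch (OpenAIProvider) and flags the DummyLLM assignment as incompatible. A structural Protocol accepts any class whose method signatures match, with no shared base class. A minimal self-contained reproduction of the pattern (Greeter and the two classes are invented for illustration):

    import random
    from typing import Protocol

    class Greeter(Protocol):
        def greet(self, *, name: str) -> str: ...

    class English:
        def greet(self, *, name: str) -> str:
            return f"hello {name}"

    class French:
        def greet(self, *, name: str) -> str:
            return f"bonjour {name}"

    g: Greeter  # declare once with the protocol type...
    g = English() if random.random() < 0.5 else French()  # ...and both branches type-check
    print(g.greet(name="mypy"))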