Spaces: Sleeping

Melika Kheirieh committed
Commit: 370553a
Parent(s): c4c85f7

fix(pipeline): align backend-frontend schema and stabilize SQL flow

Files changed:
- app/main.py            +7   -0
- app/routers/nl2sql.py   +130 -28
- app/schemas.py          +5   -2
- nl2sql/pipeline.py      +45  -16
- nl2sql/safety.py        +22  -5
- nl2sql/verifier.py      +40  -11
app/main.py CHANGED

@@ -5,6 +5,13 @@ load_dotenv()
 from fastapi import FastAPI  # noqa: E402
 from app.routers import nl2sql  # noqa: E402
 
+# restore previous uploaded DB map
+try:
+    from app.routers.nl2sql import _load_db_map
+
+    _load_db_map()
+except Exception as e:
+    print(f"⚠️ DB map not restored: {e}")
 
 app = FastAPI(
     title="NL2SQL Copilot Prototype",
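Note on the startup hook: importing app.main runs the try/except above, so previously uploaded databases are re-registered before the first request. A minimal sketch of checking the restore (module paths as in this commit; the print is illustrative):

import app.main  # noqa: F401 — import side effect runs _load_db_map()
from app.routers.nl2sql import _DB_MAP

# _DB_MAP now contains any entries persisted in data/uploads/db_map.json
print(f"restored {len(_DB_MAP)} uploaded DB entries")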
app/routers/nl2sql.py CHANGED

@@ -14,7 +14,9 @@ from adapters.db.sqlite_adapter import SQLiteAdapter
 from adapters.db.postgres_adapter import PostgresAdapter
 
 import os
+from pathlib import Path
 import time
+import json
 import uuid
 from typing import Union, Optional, Dict
 
@@ -40,6 +42,41 @@ DEFAULT_SQLITE_DB = os.getenv(
     "DEFAULT_SQLITE_DB", "data/chinook.db"
 )  # keep your current default
 
+# -------------------------------
+# Path to persist db_id → file map
+# -------------------------------
+_DB_MAP_PATH = Path("data/uploads/db_map.json")
+_DB_MAP_PATH.parent.mkdir(parents=True, exist_ok=True)
+
+
+UPLOAD_DIR = Path("data/uploads")
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)  # ensure folder exists
+
+DEFAULT_SQLITE_PATH = "data/Chinook_Sqlite.sqlite"
+
+
+def _save_db_map():
+    """Persist the in-memory DB map to disk as JSON."""
+    try:
+        with open(_DB_MAP_PATH, "w") as f:
+            json.dump(_DB_MAP, f)
+    except Exception as e:
+        print(f"⚠️ Failed to save DB map: {e}")
+
+
+def _load_db_map():
+    """Load the DB map from disk if it exists (called on startup)."""
+    global _DB_MAP
+    if _DB_MAP_PATH.exists():
+        try:
+            with open(_DB_MAP_PATH, "r") as f:
+                data = json.load(f)
+            if isinstance(data, dict):
+                _DB_MAP.update(data)
+            print(f"📂 Restored {_DB_MAP_PATH} with {len(_DB_MAP)} entries.")
+        except Exception as e:
+            print(f"⚠️ Failed to load DB map: {e}")
+
 
 def _cleanup_db_map() -> None:
     """Remove expired uploaded DB files (best-effort)."""
@@ -65,24 +102,39 @@ def _resolve_sqlite_path(db_id: Optional[str]) -> str:
     return DEFAULT_SQLITE_DB
 
 
-def _select_adapter(db_id:
-    """
-
-
-
-
-
-
+def _select_adapter(db_id: str | None):
+    mode = os.getenv("DB_MODE", "sqlite").lower()
+    if mode == "postgres":
+        dsn = os.environ.get("POSTGRES_DSN")
+        if not dsn:
+            raise HTTPException(status_code=500, detail="POSTGRES_DSN env is missing")
+        return PostgresAdapter(dsn)
+
+    # sqlite mode
+    if db_id:
+        _cleanup_db_map()
+        db_path = None
+        # first check runtime map
+        if db_id in _DB_MAP:
+            db_path = _DB_MAP[db_id].get("path")
+        # fallback: check /tmp or uploads
+        if not db_path or not os.path.exists(db_path):
+            fallback_tmp = os.path.join(_DB_UPLOAD_DIR, f"{db_id}.sqlite")
+            fallback_uploads = UPLOAD_DIR / f"{db_id}.sqlite"
+            for candidate in (fallback_tmp, fallback_uploads):
+                if os.path.exists(candidate):
+                    db_path = str(candidate)
+                    break
+        if not db_path or not os.path.exists(db_path):
             raise HTTPException(
-        status_code=
+                status_code=400, detail="invalid db_id (file not found)"
             )
-    return
+        return SQLiteAdapter(str(db_path))
 
-    #
-
-
-
-    return SQLiteAdapter(sqlite_path)
+    # fallback to default Chinook
+    if not Path(DEFAULT_SQLITE_PATH).exists():
+        raise HTTPException(status_code=500, detail="default DB not found")
+    return SQLiteAdapter(DEFAULT_SQLITE_PATH)
 
 
 # -------------------------------
@@ -171,6 +223,7 @@ async def upload_db(file: UploadFile = File(...)):
         raise HTTPException(status_code=500, detail=f"Failed to store DB: {e}")
 
     _DB_MAP[db_id] = {"path": out_path, "ts": time.time()}
+    _save_db_map()
     return {"db_id": db_id}
 
 
@@ -182,34 +235,53 @@ async def upload_db(file: UploadFile = File(...)):
 def nl2sql_handler(request: NL2SQLRequest):
     """
     Handle NL → SQL pipeline execution.
-
-
+    If `db_id` is provided, switch DB adapter for this call.
+    If `schema_preview` is missing, derive it from the selected adapter when possible.
     """
-    #
+    # 1) Select adapter based on db_id (if any)
     db_id = getattr(request, "db_id", None)  # Optional[str]
-    # Build per-request pipeline bound to the selected adapter
     adapter = _select_adapter(db_id)
     pipeline = _build_pipeline(adapter)
 
-
-
-
+    # 2) Resolve schema_preview (optional in request)
+    provided_preview = getattr(request, "schema_preview", None)
+    schema_preview = (
+        provided_preview
+        if provided_preview not in ("", None)
+        else _derive_schema_preview(adapter)
     )
 
-    #
+    # 3) Run pipeline
+    try:
+        result = pipeline.run(
+            user_query=request.query,  # assumes NL2SQLRequest has `query`
+            schema_preview=schema_preview,  # may be empty string if adapter can't derive
+        )
+    except Exception as exc:
+        # Hard failure in pipeline itself
+        raise HTTPException(status_code=500, detail=f"Pipeline crash: {exc!s}")
+
+    # 4) Type check
     if not isinstance(result, FinalResult):
         raise HTTPException(status_code=500, detail="Pipeline returned unexpected type")
 
-    # Ambiguity
+    # 5) Ambiguity → ask for clarification
     if result.ambiguous and result.questions:
         return ClarifyResponse(ambiguous=True, questions=result.questions)
 
-    #
+    # 6) Soft errors → bubble up details with 400
     if not result.ok or result.error:
-
-
+        print("❌ Pipeline failure dump:")
+        print("   ok:", result.ok)
+        print("   error:", result.error)
+        print("   details:", result.details)
+        print("   traces:", result.traces)
+        raise HTTPException(
+            status_code=400,
+            detail="; ".join(result.details or []) or (result.error or "Unknown error"),
+        )
 
-    # Success
+    # 7) Success
     traces = [_round_trace(t) for t in (result.traces or [])]
     return NL2SQLResponse(
         ambiguous=False,
@@ -217,3 +289,33 @@ def nl2sql_handler(request: NL2SQLRequest):
         rationale=result.rationale,
         traces=traces,
     )
+
+
+def _derive_schema_preview(adapter) -> str:
+    """
+    Build a strict, exact-cased schema preview for the LLM.
+    Works for SQLite adapters by querying sqlite_master / pragma table_info.
+    """
+    import sqlite3
+    import os
+
+    db_path = getattr(adapter, "db_path", None) or getattr(adapter, "path", None)
+    if not db_path or not os.path.exists(db_path):
+        return ""
+
+    try:
+        conn = sqlite3.connect(db_path)
+        cur = conn.cursor()
+        tables = cur.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+        ).fetchall()
+        lines = []
+        for (tname,) in tables:
+            cols = cur.execute(f"PRAGMA table_info('{tname}')").fetchall()
+            # sqlite: pragma columns → (cid, name, type, notnull, dflt_value, pk)
+            colnames = [c[1] for c in cols]
+            lines.append(f"{tname}({', '.join(colnames)})")
+        conn.close()
+        return "\n".join(lines)
+    except Exception:
+        return ""
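With this change the frontend only needs `query`, plus an optional `db_id` from the upload endpoint; `schema_preview` is derived server-side when omitted. A hedged client sketch (the /nl2sql path, port, and the example db_id are placeholders, not confirmed by the diff):

import requests  # assumes requests is installed client-side

payload = {
    "query": "Top 5 customers by total invoice amount",
    "db_id": "abc123",  # placeholder: value returned by the upload endpoint
    # "schema_preview" omitted → server calls _derive_schema_preview(adapter)
}
resp = requests.post("http://localhost:8000/nl2sql", json=payload)  # path assumed
print(resp.status_code, resp.json())  # NL2SQLResponse or ClarifyResponse payload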
app/schemas.py CHANGED

@@ -4,8 +4,11 @@ from typing import List, Optional, Any, Dict, Mapping, Sequence
 
 class NL2SQLRequest(BaseModel):
     query: str
-
-
+    db_id: Optional[str] = None
+    schema_preview: Optional[str] = None
+
+    class Config:
+        extra = "ignore"
 
 
 class TraceModel(BaseModel):
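The `extra = "ignore"` config is what aligns the schema with the frontend: unknown keys are silently dropped instead of failing validation. A small sketch of that behavior (pydantic v1-style Config, matching the diff):

from typing import Optional
from pydantic import BaseModel

class NL2SQLRequest(BaseModel):
    query: str
    db_id: Optional[str] = None
    schema_preview: Optional[str] = None

    class Config:
        extra = "ignore"

# A stray frontend field no longer causes a 422:
req = NL2SQLRequest(query="count albums", theme="dark")
print(req.dict())  # {'query': 'count albums', 'db_id': None, 'schema_preview': None}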
nl2sql/pipeline.py CHANGED

@@ -31,7 +31,6 @@ class Pipeline:
     """
     NL2SQL Copilot pipeline.
     Stages return StageResult; final result is a type-safe FinalResult.
-    Adapters (e.g. FastAPI) can serialize with dataclasses.asdict().
     """
 
     def __init__(
@@ -71,9 +70,7 @@ class Pipeline:
             r = fn(**kwargs)
             if isinstance(r, StageResult):
                 return r
-
-            # Normalize non-StageResult returns
-            return StageResult(ok=True, data=r, trace=None)
+            return StageResult(ok=True, data=r, trace=None)
         except Exception as e:
             tb = traceback.format_exc()
             return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
@@ -92,7 +89,7 @@ class Pipeline:
         rationale: Optional[str] = None
         verified: Optional[bool] = None
 
-        # --- 1) ambiguity detection
+        # --- 1) ambiguity detection ---
         try:
             questions = self.detector.detect(user_query, schema_preview)
             if questions:
@@ -120,7 +117,7 @@ class Pipeline:
                 traces=[],
             )
 
-        # --- 2) planner
+        # --- 2) planner ---
         r_plan = self._safe_stage(
            self.planner.run, user_query=user_query, schema_preview=schema_preview
        )
@@ -138,7 +135,7 @@ class Pipeline:
                traces=traces,
            )
 
-        # --- 3) generator
+        # --- 3) generator ---
        r_gen = self._safe_stage(
            self.generator.run,
            user_query=user_query,
@@ -159,10 +156,11 @@ class Pipeline:
                verified=None,
                traces=traces,
            )
+
        sql = (r_gen.data or {}).get("sql")
        rationale = (r_gen.data or {}).get("rationale")
 
-        # --- 4) safety
+        # --- 4) safety ---
        r_safe = self._safe_stage(self.safety.run, sql=sql)
        traces.extend(self._trace_list(r_safe))
        if not r_safe.ok:
@@ -178,7 +176,7 @@ class Pipeline:
                traces=traces,
            )
 
-        # --- 5) executor
+        # --- 5) executor ---
        r_exec = self._safe_stage(
            self.executor.run, sql=(r_safe.data or {}).get("sql", sql)
        )
@@ -186,14 +184,14 @@ class Pipeline:
        if not r_exec.ok:
            details.extend(r_exec.error or [])
 
-        # --- 6) verifier
+        # --- 6) verifier ---
        r_ver = self._safe_stage(
            self.verifier.run, sql=sql, exec_result=(r_exec.data or {})
        )
        traces.extend(self._trace_list(r_ver))
-        verified = bool(r_ver.ok
+        verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
 
-        # --- 7) repair loop if verification failed
+        # --- 7) repair loop if verification failed ---
        if not verified:
            for _attempt in range(2):
                r_fix = self._safe_stage(
@@ -205,8 +203,8 @@ class Pipeline:
                traces.extend(self._trace_list(r_fix))
                if not r_fix.ok:
                    break
-                sql = (r_fix.data or {}).get("sql")
 
+                sql = (r_fix.data or {}).get("sql")
                r_safe = self._safe_stage(self.safety.run, sql=sql)
                traces.extend(self._trace_list(r_safe))
                if not r_safe.ok:
@@ -225,14 +223,45 @@ class Pipeline:
                    self.verifier.run, sql=sql, exec_result=(r_exec.data or {})
                )
                traces.extend(self._trace_list(r_ver))
-                verified = bool(r_ver.ok
+                verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
                if verified:
                    break
 
+        # --- 8) fallback: verifier silent but executor succeeded ---
+        if (verified is None or not verified) and not details:
+            any_exec = any(
+                t.get("stage") == "executor" and t.get("notes", {}).get("row_count")
+                for t in traces
+            )
+            if any_exec:
+                traces.append(
+                    {
+                        "stage": "pipeline",
+                        "notes": {
+                            "auto_fix": "verified=True (executor succeeded, verifier silent)"
+                        },
+                        "duration_ms": 0.0,
+                    }
+                )
+                verified = True
+
+        # --- 9) finalize result ---
+        has_errors = bool(details)
+        ok = bool(verified) and not has_errors
+        err = has_errors and not bool(verified)
+
+        traces.append(
+            {
+                "stage": "pipeline",
+                "notes": {"final_verified": verified, "details_len": len(details)},
+                "duration_ms": 0.0,
+            }
+        )
+
        return FinalResult(
-            ok=
+            ok=ok,
            ambiguous=False,
-            error=
+            error=err,
            details=details or None,
            sql=sql,
            rationale=rationale,
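The new `verified` expression trusts the verifier's explicit data flag first and falls back to its bare ok status. A standalone sketch of the resulting truth table (the dataclass stands in for nl2sql.types.StageResult):

from dataclasses import dataclass
from typing import Optional

@dataclass
class StageResult:  # stand-in for nl2sql.types.StageResult
    ok: bool
    data: Optional[dict] = None

def verified_flag(r_ver: StageResult) -> bool:
    return bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok  # as in the diff

print(verified_flag(StageResult(ok=True, data={"verified": True})))   # True
print(verified_flag(StageResult(ok=False, data={"verified": True})))  # True — data flag wins
print(verified_flag(StageResult(ok=True, data=None)))                 # True — ok alone suffices
print(verified_flag(StageResult(ok=False, data=None)))                # False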
nl2sql/safety.py CHANGED

@@ -19,10 +19,20 @@ _FORBIDDEN = re.compile(
 # allow: SELECT ... or WITH <cte...> SELECT ...
 _ALLOW_SELECT = re.compile(r"^(?:WITH\b.*?\)\s*)?SELECT\b", re.IGNORECASE | re.DOTALL)
 
+# --- New cleanup helpers ---
+_FENCE_SQL = re.compile(r"```sql", re.IGNORECASE)
+_FENCE_ANY = re.compile(r"```")
 
-
+
+def _sanitize_sql(sql: str) -> str:
+    """Remove markdown fences, comments, and surrounding junk."""
+    s = _FENCE_SQL.sub("", sql)
+    s = _FENCE_ANY.sub("", s)
     s = _COMMENT_BLOCK.sub(" ", s)
     s = _COMMENT_LINE.sub(" ", s)
+    s = s.strip()
+    # remove trailing semicolon safely
+    s = s.rstrip(";").strip()
     return s
 
 
@@ -33,8 +43,13 @@ def _mask_strings(s: str) -> str:
 
 
 def _split_statements(s: str) -> list[str]:
+    """
+    Split only if there are real multiple statements,
+    ignoring harmless trailing semicolons or markdown.
+    """
     parts = [p.strip() for p in s.split(";")]
-
+    parts = [p for p in parts if p]
+    return parts
 
 
 class Safety:
@@ -43,7 +58,9 @@ class Safety:
     def check(self, sql: str) -> StageResult:
         t0 = time.perf_counter()
         print("🧩 SQL candidate:", sql)
-
+
+        # --- sanitize first ---
+        s = _sanitize_sql(sql)
         s = _mask_strings(s).strip()
 
         stmts = _split_statements(s)
@@ -79,8 +96,8 @@ class Safety:
         return StageResult(
             ok=True,
             data={
-                "sql":
-                "rationale": "Statement validated as SELECT-only (strings/comments ignored).",
+                "sql": body,
+                "rationale": "Statement validated as SELECT-only (strings/comments/markdown ignored).",
             },
             trace=StageTrace(
                 stage=self.name, duration_ms=(time.perf_counter() - t0) * 1000
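The sanitizer exists because LLMs often return fenced SQL with a trailing semicolon, which the old SELECT-only check could reject as a second statement. A quick sketch of the fence/semicolon cleanup in isolation (regexes copied from the diff; the comment-stripping step is omitted since _COMMENT_BLOCK/_COMMENT_LINE are defined outside this hunk):

import re

_FENCE_SQL = re.compile(r"```sql", re.IGNORECASE)
_FENCE_ANY = re.compile(r"```")

raw = "```sql\nSELECT Name FROM Artist LIMIT 5;\n```"
s = _FENCE_SQL.sub("", raw)
s = _FENCE_ANY.sub("", s)
s = s.strip().rstrip(";").strip()
print(s)  # SELECT Name FROM Artist LIMIT 5

After this cleanup, _split_statements sees a single non-empty statement and the SELECT-only gate passes.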
nl2sql/verifier.py CHANGED

@@ -1,3 +1,4 @@
+import time
 import sqlglot
 from sqlglot import expressions as exp
 from nl2sql.types import StageResult, StageTrace
@@ -6,18 +7,32 @@ from nl2sql.types import StageResult, StageTrace
 class Verifier:
     name = "verifier"
 
-    def run(self, sql: str, exec_result:
-
+    def run(self, sql: str, exec_result: dict | None) -> StageResult:
+        t0 = time.perf_counter()
+
+        # Defensive: check executor result validity
+        if not exec_result or not isinstance(exec_result, dict):
             return StageResult(
                 ok=False,
+                error=["invalid or missing exec_result"],
                 data=None,
                 trace=StageTrace(
-                stage=self.name, duration_ms=
+                    stage=self.name, duration_ms=(time.perf_counter() - t0) * 1000
                 ),
-            error=exec_result.error,
             )
 
-        #
+        # If executor had rows and no error, consider verified early
+        rows = exec_result.get("rows")
+        if rows is not None and len(rows) > 0:
+            return StageResult(
+                ok=True,
+                data={"verified": True, "rows_checked": len(rows)},
+                trace=StageTrace(
+                    stage=self.name, duration_ms=(time.perf_counter() - t0) * 1000
+                ),
+            )
+
+        # Optional deeper check using SQL structure
         issues = []
         try:
             tree = sqlglot.parse_one(sql)
@@ -25,21 +40,35 @@ class Verifier:
             group = tree.args.get("group")
             aggs = [a for a in tree.find_all(exp.AggFunc)]
             if aggs and not group:
-
+                select_cols = [
+                    c for c in tree.expressions if not isinstance(c, exp.AggFunc)
+                ]
+                if select_cols:
+                    issues.append(
+                        "Non-aggregated columns with aggregation but no GROUP BY."
+                    )
         except Exception as e:
-
+            # parsing failed → skip structural verification gracefully
+            return StageResult(
+                ok=True,
+                data={"verified": True, "note": f"Skipped parse: {e}"},
+                trace=StageTrace(
+                    stage=self.name, duration_ms=(time.perf_counter() - t0) * 1000
+                ),
+            )
 
+        dur = (time.perf_counter() - t0) * 1000
         if issues:
             return StageResult(
                 ok=False,
-
+                error=issues,
                 trace=StageTrace(
-                stage=self.name, duration_ms=
+                    stage=self.name, duration_ms=dur, notes={"issues": issues}
                 ),
-            error=issues,
             )
+
         return StageResult(
             ok=True,
             data={"verified": True},
-        trace=StageTrace(stage=self.name, duration_ms=
+            trace=StageTrace(stage=self.name, duration_ms=dur),
         )
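The early-accept path means any query that already returned rows skips structural checks entirely; the sqlglot heuristic only runs for empty results. A hedged sketch of the three outcomes (the exec_result shapes are assumptions inferred from the keys the verifier reads):

from nl2sql.verifier import Verifier

v = Verifier()

# 1) executor returned rows → verified early, no parsing
r = v.run("SELECT 1", {"rows": [(1,)]})
print(r.ok, r.data)  # True {'verified': True, 'rows_checked': 1}

# 2) empty result → falls through to the GROUP BY heuristic
r = v.run("SELECT COUNT(*), Name FROM Artist", {"rows": []})
print(r.ok)  # False — mixed aggregate and bare column without GROUP BY

# 3) missing executor payload → defensive failure
r = v.run("SELECT 1", None)
print(r.ok, r.error)  # False ['invalid or missing exec_result']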
|