Melika Kheirieh committed on
Commit
570f7bd
·
1 Parent(s): 5eeca35

init: NL2SQL Copilot base with API and Dockerfile

Browse files
.github/workflows/ci.yml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+
8
+ jobs:
9
+ build-test:
10
+ runs-on: ubuntu-latest
11
+
12
+ env:
13
+ PIP_NO_CACHE_DIR: 1
14
+
15
+ steps:
16
+ - name: Checkout repository
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -r requirements.txt
28
+
29
+ - name: Lint (ruff)
30
+ run: ruff check .
31
+
32
+ - name: Type check (mypy)
33
+ run: mypy .
34
+
35
+ - name: Run tests
36
+ run: pytest -q
37
+
38
+ docker-build:
39
+ needs: build-test
40
+ runs-on: ubuntu-latest
41
+ if: github.ref == 'refs/heads/main'
42
+
43
+ steps:
44
+ - name: Checkout code
45
+ uses: actions/checkout@v4
46
+
47
+ - name: Login to GHCR
48
+ if: secrets.GHCR_TOKEN != ''
49
+ run: echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
50
+
51
+ - name: Build Docker image
52
+ run: |
53
+ IMAGE=ghcr.io/${{ github.repository_owner }}/nl2sql-copilot:${{ github.sha }}
54
+ docker build -t $IMAGE .
55
+ echo "IMAGE=$IMAGE" >> $GITHUB_ENV
56
+
57
+ - name: Push image
58
+ if: secrets.GHCR_TOKEN != ''
59
+ run: docker push $IMAGE
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------- Stage 1: Build wheels ----------
2
+ FROM python:3.12-slim AS builder
3
+
4
+ # Set working directory for the build stage
5
+ WORKDIR /build
6
+
7
+ # Install system dependencies required to compile some Python packages
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ build-essential libpq-dev && \
10
+ rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy only requirements first (so Docker caching works efficiently)
13
+ COPY requirements.txt .
14
+
15
+ # Build all dependencies as wheel files inside /wheels
16
+ RUN pip install --upgrade pip && \
17
+ pip wheel --wheel-dir /wheels -r requirements.txt
18
+
19
+
20
+ # ---------- Stage 2: Runtime image ----------
21
+ FROM python:3.12-slim AS runtime
22
+
23
+ # Set working directory for the application
24
+ WORKDIR /app
25
+
26
+ # Copy prebuilt wheels from the builder stage
27
+ COPY --from=builder /wheels /wheels
28
+
29
+ # Install dependencies from prebuilt wheels (no need to compile again)
30
+ COPY requirements.txt .
31
+ RUN pip install --no-cache-dir --find-links=/wheels -r requirements.txt
32
+
33
+ # Copy the actual application code
34
+ COPY . .
35
+
36
+ # Expose the FastAPI port
37
+ EXPOSE 8000
38
+
39
+ # Start FastAPI with Uvicorn
40
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
adapters/db/base.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, List, Dict, Any, Protocol
2
+
3
+
4
+ class DBAdapter(Protocol):
5
+ """Abstract database adapter for read-only queries."""
6
+ name: str
7
+ dialect: str
8
+
9
+ def preview_schema(self, limit_per_table: int = 0) -> str:
10
+ """Generate a readable summary of the database schema with optional sample rows per table."""
11
+
12
+ def execute(self, sql: str) -> Tuple[List[Tuple[Any, ...]], List[str]]:
13
+ """Execute a SELECT query and return (rows, columns)."""
adapters/db/postgres_adapter.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import psycopg
2
+ from typing import Any, List, Tuple
3
+ from adapters.db.base import DBAdapter
4
+
5
+ class PostgresAdapter(DBAdapter):
6
+ name = "postgres"
7
+ dialect = "postgres"
8
+
9
+ def __init__(self, dsn: str):
10
+ """
11
+ DSN example:
12
+ "dbname=demo user=postgres password=postgres host=localhost port=5432"
13
+ """
14
+ self.dsn = dsn
15
+
16
+ def preview_schema(self, limit_per_table: int = 0) -> str:
17
+ with psycopg.connect(self.dsn) as conn:
18
+ cur = conn.cursor()
19
+ cur.execute("""
20
+ SELECT table_name
21
+ FROM information_schema.tables
22
+ WHERE table_schema = 'public';
23
+ """)
24
+ tables = [t[0] for t in cur.fetchall()]
25
+ lines = []
26
+ for t in tables:
27
+ cur.execute("""
28
+ SELECT column_name, data_type
29
+ FROM information_schema.columns
30
+ WHERE table_name = %s;
31
+ """, (t,))
32
+ cols = [f"{c[0]}:{c[1]}" for c in cur.fetchall()]
33
+ lines.append(f"- {t} ({', '.join(cols)})")
34
+ return "\n".join(lines)
35
+
36
+ def execute(self, sql: str) -> Tuple[List[Tuple[Any, ...]], List[str]]:
37
+ if not sql.strip().lower().startswith("select"):
38
+ raise ValueError("Only SELECT statements are allowed.")
39
+ with psycopg.connect(self.dsn) as conn:
40
+ cur = conn.cursor()
41
+ cur.execute(sql)
42
+ rows = cur.fetchall()
43
+ cols = [desc[0] for desc in cur.description]
44
+ return rows, cols
adapters/db/sqlite_adapter.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from typing import List, Tuple, Any
3
+ from adapters.db.base import DBAdapter
4
+
5
+ class SQLiteAdapter(DBAdapter):
6
+ name = "sqlite"
7
+ dialect = "sqlite"
8
+
9
+ def __init__(self, path: str):
10
+ self.path = path
11
+
12
+ def preview_schema(self, limit_per_table: int = 0) -> str:
13
+ with sqlite3.connect(self.path, uri=True) as conn:
14
+ cur = conn.cursor()
15
+ cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
16
+ tables = [t[0] for t in cur.fetchall()]
17
+ lines = []
18
+ for t in tables:
19
+ cur.execute(f"PRAGMA table_info({t});")
20
+ cols = [f"{c[1]}:{c[2]}" for c in cur.fetchall()]
21
+ lines.append(f"- {t} ({', '.join(cols)})")
22
+ return "\n".join(lines)
23
+
24
+ def execute(self, sql: str) -> Tuple[List[Tuple[Any, ...]], List[str]]:
25
+ # enforce read-only connection
26
+ uri = f"file:{self.path}?mode=ro&uri=true"
27
+ with sqlite3.connect(uri, uri=True, timeout=3) as conn:
28
+ cur = conn.cursor()
29
+ cur.execute(sql)
30
+ rows = cur.fetchall()
31
+ cols = [desc[0] for desc in cur.description]
32
+ return rows, cols
adapters/llm/base.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # adapters/llm/base.py
2
+ from __future__ import annotations
3
+ from typing import Tuple, List, Dict, Any, Protocol
4
+
5
+ class LLMProvider(Protocol):
6
+ provider_id: str
7
+
8
+ def plan(self, *, user_query: str, schema_preview: str) -> Tuple[str, int, int, float]:
9
+ """Return (plan_text, token_in, token_out, cost_usd)."""
10
+
11
+ def generate_sql(self, *, user_query: str, schema_preview: str, plan_text: str,
12
+ clarify_answers: Dict[str, Any] | None = None) -> Tuple[str, str, int, int, float]:
13
+ """Return (sql, rationale, token_in, token_out, cost_usd)."""
14
+
15
+ def repair(self, *, sql: str, error_msg: str, schema_preview: str) -> Tuple[str, int, int, float]:
16
+ """Return (patched_sql, token_in, token_out, cost_usd)."""
adapters/llm/openai_provider.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import os
3
+ from typing import Tuple, Dict, Any, List
4
+ import json
5
+ from adapters.llm.base import LLMProvider
6
+ from openai import OpenAI
7
+
8
+ # NOTE: Read keys/base URL from env. Do NOT pass base_url in constructors.
9
+ # - OPENAI_API_KEY (required)
10
+ # - OPENAI_BASE_URL (optional; defaults to OpenAI public)
11
+ # - OPENAI_MODEL_ID (e.g., "gpt-4o-mini")
12
+
13
+
14
+
15
+ class OpenAIProvider(LLMProvider):
16
+ provider_id = "openai"
17
+
18
+ def __init__(self) -> None:
19
+ self.client = OpenAI(
20
+ api_key=os.environ["OPENAI_API_KEY"],
21
+ base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
22
+ )
23
+ self.model = os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini")
24
+
25
+ def plan(self, *, user_query, schema_preview):
26
+ completion = self.client.chat.completions.create(
27
+ model=self.model,
28
+ messages=[
29
+ {"role": "system", "content": "You create SQL query plans."},
30
+ {"role": "user", "content": f"Query: {user_query}\nSchema:\n{schema_preview}"}
31
+ ],
32
+ temperature=0
33
+ )
34
+ msg = completion.choices[0].message.content
35
+ usage = completion.usage
36
+ return msg, usage.prompt_tokens, usage.completion_tokens, self._estimate_cost(usage)
37
+
38
+
39
+ def generate_sql(self, *, user_query, schema_preview, plan_text, clarify_answers=None):
40
+ prompt = f"""
41
+ You are a precise SQL generator.
42
+ Return ONLY valid JSON with two keys: "sql" and "rationale".
43
+ Do not include any markdown, backticks, or extra text.
44
+
45
+ Example:
46
+ {{
47
+ "sql": "SELECT * FROM singer;",
48
+ "rationale": "The user requested to list all singers."
49
+ }}
50
+
51
+ Now generate JSON for this input:
52
+
53
+ User query: {user_query}
54
+ Schema preview:
55
+ {schema_preview}
56
+ Plan: {plan_text}
57
+ Clarifications: {clarify_answers}
58
+ """
59
+ completion = self.client.chat.completions.create(
60
+ model=self.model,
61
+ messages=[
62
+ {"role": "system", "content": "You convert natural language to SQL."},
63
+ {"role": "user", "content": prompt}
64
+ ],
65
+ temperature=0
66
+ )
67
+ content = completion.choices[0].message.content.strip()
68
+ usage = completion.usage  # needed for token/cost accounting; may be None
69
+ t_in = usage.prompt_tokens if usage else None
70
+ t_out = usage.completion_tokens if usage else None
71
+ cost = self._estimate_cost(usage) if usage else None
72
+
73
+ # Robust JSON parse (with fallback to substring)
74
+ try:
75
+ parsed = json.loads(content)
76
+ except json.JSONDecodeError:
77
+ start = content.find("{")
78
+ end = content.rfind("}")
79
+ if start != -1 and end != -1:
80
+ try:
81
+ parsed = json.loads(content[start:end + 1])
82
+ except Exception:
83
+ raise ValueError(f"Invalid LLM JSON output: {content[:200]}")
84
+ else:
85
+ raise ValueError(f"Invalid LLM JSON output: {content[:200]}")
86
+
87
+ sql = (parsed.get("sql") or "").strip()
88
+ rationale = parsed.get("rationale") or ""
89
+
90
+ if not sql:
91
+ raise ValueError("LLM returned empty 'sql'")
92
+
93
+ # IMPORTANT: return the expected 5-tuple
94
+ return sql, rationale, t_in, t_out, cost
95
+
96
+
97
+ def repair(self, *, sql, error_msg, schema_preview):
98
+ completion = self.client.chat.completions.create(
99
+ model=self.model,
100
+ messages=[
101
+ {"role": "system", "content": "You fix SQL queries keeping them SELECT-only."},
102
+ {"role": "user", "content": f"SQL:\n{sql}\nError:\n{error_msg}\nSchema:\n{schema_preview}"}
103
+ ],
104
+ temperature=0
105
+ )
106
+ msg = completion.choices[0].message.content
107
+ usage = completion.usage
108
+ return msg, usage.prompt_tokens, usage.completion_tokens, self._estimate_cost(usage)
109
+
110
+ def _estimate_cost(self, usage):
111
+ # Rough estimation example — can be refined with official token pricing
112
+ total = usage.prompt_tokens + usage.completion_tokens
113
+ return total * 0.000001
app.py DELETED
@@ -1,235 +0,0 @@
1
- from config import (
2
- LLM_MODEL,
3
- LLM_TEMPERATURE,
4
- FORBIDDEN_KEYWORDS,
5
- FORBIDDEN_TABLES
6
- )
7
- import os
8
- import sqlite3
9
- import json
10
- import re
11
- from typing import Optional, Tuple, List
12
-
13
- import gradio as gr
14
- import sqlglot
15
- from sqlglot import exp
16
-
17
- from langchain_openai import ChatOpenAI
18
- from langchain_community.utilities import SQLDatabase
19
- from langchain.chains import create_sql_query_chain
20
- from langchain.prompts import ChatPromptTemplate
21
-
22
-
23
- def get_readonly_sqlite_url(db_path: str) -> str:
24
- return f"file:{db_path}?mode=ro&uri=true"
25
-
26
- def get_schema_preview(db_path: str, limit_per_table: int = 0) -> str:
27
- uri = get_readonly_sqlite_url(db_path)
28
- with sqlite3.connect(uri, uri=True, timeout=3) as conn:
29
- conn.row_factory = sqlite3.Row
30
- cur = conn.cursor()
31
- cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
32
- tables = [r["name"] for r in cur.fetchall()]
33
- lines = []
34
- for t in tables:
35
- # skip SQLite internals
36
- if t in FORBIDDEN_TABLES:
37
- continue
38
- cur.execute(f"PRAGMA table_info({t});")
39
- cols = cur.fetchall()
40
- col_line = ", ".join([f"{c['name']}:{c['type']}" for c in cols])
41
- lines.append(f"- {t} ({col_line})")
42
- if limit_per_table > 0:
43
- try:
44
- cur.execute(f"SELECT * FROM {t} LIMIT {limit_per_table};")
45
- sample = cur.fetchall()
46
- if sample:
47
- lines.append(f" sample rows: {len(sample)}")
48
- except Exception:
49
- pass
50
- if not lines:
51
- return "(no user tables found)"
52
- return "\n".join(lines)
53
-
54
-
55
- def validate_sql_safe(sql: str) -> Tuple[bool, str]:
56
- if sql.count(";") > 0:
57
- if sql.strip().endswith(";"):
58
- if sql.strip()[:-1].count(";") > 0:
59
- return False, "Multiple statements are not allowed."
60
- else:
61
- return False, "Multiple statements are not allowed."
62
-
63
- upper = re.sub(r"\s+", " ", sql).strip()
64
- for kw in FORBIDDEN_KEYWORDS:
65
- if re.search(rf"\b{kw}\b", upper):
66
- return False, f"Keyword '{kw}' is not allowed."
67
-
68
- try:
69
- parsed = sqlglot.parse(sql, read='sqlite')
70
- except Exception as e:
71
- return False, f"SQL parse error: {e}"
72
-
73
- if not parsed or len(parsed) != 1:
74
- return False, "Exactly one SQL statement is allowed."
75
-
76
- stmt = parsed[0]
77
- if not isinstance(stmt, exp.Select):
78
- return False, "Only SELECT statements are allowed."
79
-
80
- for table in stmt.find_all(exp.Table):
81
- table_name = table.name.lower() if table.name else ""
82
- if table_name in FORBIDDEN_TABLES:
83
- return False, f"Access to {table_name} is not allowed."
84
-
85
- return True, "OK"
86
-
87
- def execute_select(db_path: str, sql: str, max_rows: int = 1000, timeout: float = 5.0) -> Tuple[list[str], List[List]]:
88
- uri = get_readonly_sqlite_url(db_path)
89
- if not re.search(r"\bLIMIT\b", sql, re.IGNORECASE):
90
- sql = f"{sql.rstrip(';')} LIMIT {max_rows}"
91
-
92
- with sqlite3.connect(uri, uri=True, timeout=timeout) as conn:
93
- conn.row_factory = sqlite3.Row
94
- cur = conn.cursor()
95
- cur.execute(sql)
96
- rows = cur.fetchall()
97
- if rows:
98
- cols = rows[0].keys()
99
- data = [list(r) for r in rows]
100
- return list(cols), data
101
- else:
102
- return [], []
103
-
104
-
105
-
106
- custom_prompt = ChatPromptTemplate.from_template("""
107
- Given the following question, return ONLY a valid SQL query in JSON form.
108
-
109
- Question: {input}
110
- Database schema: {table_info}
111
-
112
- You may sample/preview at most {top_k} rows if you need examples.
113
-
114
- Respond in this exact JSON format:
115
- {{
116
- "sql": "<SQL_QUERY_HERE>"
117
- }}
118
- """)
119
-
120
-
121
- def make_sql_chain(sql_db: SQLDatabase):
122
- assert hasattr(sql_db, "get_table_info"), "Expected LangChain SQLDatabase"
123
- llm = ChatOpenAI(model=LLM_MODEL, temperature=LLM_TEMPERATURE)
124
- chain = create_sql_query_chain(llm, sql_db, prompt=custom_prompt, k=20)
125
- return chain
126
-
127
-
128
- def on_upload_database(db_file, state):
129
- if db_file is None:
130
- return state, "No file provided.", "(no schema)"
131
- path = db_file.name
132
-
133
- sql_db = SQLDatabase.from_uri(f"sqlite:///{path}")
134
-
135
- schema_text = get_schema_preview(path, limit_per_table=0)
136
-
137
- chain = make_sql_chain(sql_db)
138
-
139
- new_state = {
140
- "db_path": path,
141
- "sql_db": sql_db,
142
- "schema_text": schema_text,
143
- "chain": chain,
144
- }
145
- return new_state, f"Database '{os.path.basename(path)}' uploaded successfully.", schema_text
146
-
147
- def extract_sql_safe(output_text: str) -> str:
148
- try:
149
- obj = json.loads(output_text)
150
- if isinstance(obj, dict) and "sql" in obj:
151
- return obj["sql"].strip()
152
- except Exception:
153
- pass
154
- m = re.search(r"```sql\s*(.*?)\s*```", output_text, re.DOTALL | re.IGNORECASE)
155
- if m:
156
- return m.group(1).strip()
157
- return output_text.strip()
158
-
159
- def on_generate_query(question , max_rows, state):
160
- if not state or not state.get("db_path") or not state.get("chain"):
161
- return "Please upload a database first.", "", ""
162
- if not question or not question.strip():
163
- return "Please enter a question.", "", ""
164
-
165
- try:
166
- generated_sql = state["chain"].invoke({"question": question})
167
-
168
- sql = extract_sql_safe(str(generated_sql))
169
-
170
- ok, msg = validate_sql_safe(sql)
171
- if not ok:
172
- return f"Blocked SQL: {msg}", sql, ""
173
-
174
- cols, rows = execute_select(state["db_path"], sql, max_rows=max_rows)
175
- if not cols:
176
- return f"No rows returned.", sql, "[]"
177
-
178
- sample = [dict(zip(cols, r)) for r in rows[:50]]
179
- return f"Returned {len(rows)} row(s). Showing up to 50.", sql, json.dumps(sample, indent=2)
180
-
181
- except Exception as e:
182
- return f"Error: {e}", "", ""
183
-
184
-
185
- with gr.Blocks(title="nl2sql-copilot-prototype (safe)") as demo:
186
- gr.Markdown("# nl2sql-copilot-prototype (Sqlite, safe)")
187
- gr.Markdown(
188
- "Upload a **SQLite** file, ask a question in natural language, "
189
- "and I will: (1) generate SQL, (2) validate it (SELECT-only), (3) execute read-only, "
190
- "and (4) show you the results."
191
- )
192
-
193
- state = gr.State({"db_path": None, "sql_db": None, "schema_text": "", "chain": None})
194
-
195
- with gr.Row():
196
- db_file = gr.File(label="Upload SQlite Database", file_types=[".sqlite", ".db"])
197
- upload_status = gr.Textbox(label="upload Status", interactive=False)
198
-
199
- schema_box = gr.Accordion("Database schema (preview)", open=False)
200
- with schema_box:
201
- schema_md = gr.Markdown("(no schema)")
202
-
203
- gr.Markdown("---")
204
-
205
- with gr.Row():
206
- question = gr.Textbox(label="Your question", placeholder="e.g., Top 10 tracks by total sales")
207
- with gr.Row():
208
- max_row= gr.Slider(10, 5000, value=1000, step=10, label="Max rows")
209
-
210
- with gr.Row():
211
- run_btn = gr.Button("Generate & Run SQL", variant="primary")
212
-
213
- with gr.Row():
214
- status_out = gr.Textbox(label="Status")
215
- with gr.Row():
216
- sql_out = gr.Code(label="Generated SQL (validated)")
217
- with gr.Row():
218
- result_out = gr.Code(label="Result (JSON sample)")
219
-
220
- db_file.change(
221
- fn=on_upload_database,
222
- inputs=[db_file, state],
223
- outputs=[state, upload_status, schema_md],
224
- )
225
-
226
- run_btn.click(
227
- fn=on_generate_query,
228
- inputs=[question, max_row, state],
229
- outputs=[status_out, sql_out, result_out],
230
- )
231
-
232
-
233
-
234
- if __name__ == "__main__":
235
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/__init__.py ADDED
File without changes
app/main.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ from fastapi import FastAPI
5
+ from app.routers import nl2sql
6
+ app = FastAPI(
7
+ title="NL2SQL Copilot Prototype",
8
+ version="0.1.0",
9
+ description="Natural Language -> SQL Copilot API"
10
+ )
11
+
12
+ app.include_router(nl2sql.router, prefix="/api/v1")
13
+
14
+ @app.get("/healthz")
15
+ def health_check():
16
+ return {"status": "ok"}
17
+
18
+ @app.get("/")
19
+ def root():
20
+ return {"status": "ok", "message": "NL2SQL Copilot API is running"}
21
+
22
+ @app.get("/health")
23
+ def health():
24
+ return {
25
+ "status": "ok",
26
+ "db": "connected",
27
+ "llm": "reachable",
28
+ "uptime_sec": 123.4
29
+ }
app/routers/__init__.py ADDED
File without changes
app/routers/nl2sql.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, is_dataclass
2
+ from fastapi import APIRouter, HTTPException
3
+ from app.schemas import NL2SQLRequest, NL2SQLResponse, ClarifyResponse
4
+ from nl2sql.pipeline import Pipeline
5
+ from nl2sql.ambiguity_detector import AmbiguityDetector
6
+ from nl2sql.safety import Safety
7
+ from nl2sql.planner import Planner
8
+ from nl2sql.generator import Generator
9
+ from adapters.llm.openai_provider import OpenAIProvider
10
+ from nl2sql.types import StageResult
11
+ from nl2sql.executor import Executor
12
+ from nl2sql.verifier import Verifier
13
+ from nl2sql.repair import Repair
14
+ from adapters.db.sqlite_adapter import SQLiteAdapter
15
+ from adapters.db.postgres_adapter import PostgresAdapter
16
+ import os
17
+
18
+
19
+ router = APIRouter(prefix="/nl2sql")
20
+
21
+
22
+
23
+ if os.getenv("DB_MODE", "sqlite") == "postgres":
24
+ _db = PostgresAdapter(os.environ["POSTGRES_DSN"])
25
+ else:
26
+ _db = SQLiteAdapter("data/chinook.db")
27
+
28
+ # --- Composition Root ---
29
+ _llm = OpenAIProvider()  # NOTE(review): raises KeyError at import time if OPENAI_API_KEY is unset — consider lazy initialization
30
+ # _db = SQLiteAdapter("data/chinook.db")
31
+ _executor = Executor(_db)
32
+ _verifier = Verifier()
33
+ _repair = Repair(_llm)
34
+
35
+
36
+ _pipeline = Pipeline(
37
+ detector=AmbiguityDetector(),
38
+ planner=Planner(_llm),
39
+ generator=Generator(_llm),
40
+ safety=Safety(),
41
+ executor=_executor,
42
+ verifier=_verifier,
43
+ repair=_repair
44
+ )
45
+
46
+
47
+ def _to_dict(obj):
48
+ """Helper: safely convert dataclass → dict."""
49
+ return asdict(obj) if is_dataclass(obj) else obj
50
+
51
+ def _round_trace(t: dict) -> dict:
52
+ if t.get("cost_usd") is not None:
53
+ t["cost_usd"] = round(t["cost_usd"], 6)
54
+ if t.get("duration_ms") is not None:
55
+ t["duration_ms"] = round(t["duration_ms"], 2)
56
+ return t
57
+
58
+ @router.post("", name="nl2sql_handler")
59
+ def nl2sql_handler(request: NL2SQLRequest):
60
+ result = _pipeline.run(user_query=request.query, schema_preview=request.schema_preview)
61
+
62
+ # --- Ensure result type ---
63
+ if not isinstance(result, StageResult):
64
+ raise HTTPException(status_code=500, detail="Pipeline returned unexpected type")
65
+
66
+ data = result.data or {}
67
+
68
+ # --- Handle ambiguity ---
69
+ if isinstance(data, dict) and data.get("ambiguous") and data.get("questions"):
70
+ return ClarifyResponse(ambiguous=True, questions=data["questions"])
71
+
72
+ # --- Handle error ---
73
+ if not result.ok:
74
+ detail = "; ".join(result.error) if result.error else "Unknown error"
75
+ raise HTTPException(status_code=400, detail=detail)
76
+
77
+ # --- Success case ---
78
+ return NL2SQLResponse(
79
+ ambiguous=False,
80
+ sql=data.get("sql"),
81
+ rationale=data.get("rationale"),
82
+ traces=[_to_dict(t) for t in data.get("traces", [])],
83
+ )
app/schemas.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional, Any, Dict
3
+
4
+ class NL2SQLRequest(BaseModel):
5
+ query: str
6
+ schema_preview: str
7
+ db_name: Optional[str] = "default"
8
+
9
+ class TraceModel(BaseModel):
10
+ stage: str
11
+ duration_ms: float
12
+ token_in: int | None = 0
13
+ token_out: int | None = 0
14
+ cost_usd: float | None = 0
15
+ notes: Dict[str, Any] | None = None
16
+
17
+ class NL2SQLResponse(BaseModel):
18
+ ambiguous: bool = False
19
+ sql: str
20
+ rationale: Optional[str] = None
21
+ traces: List[TraceModel] = []
22
+
23
+ class ClarifyResponse(BaseModel):
24
+ ambiguous: bool = True
25
+ questions: List[str]
26
+
27
+ class ErrorResponse(BaseModel):
28
+ error: str
29
+ details: List[str] | None = None
benchmarks/results/demo.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"query": "show all users", "exec_acc": 0.0, "safe_fail": 0.0, "latency_ms": 0.610041999607347, "cost_usd": 0.0, "repair_attempts": 0, "provider": "dummy-llm"}
2
+ {"query": "top spenders", "exec_acc": 0.0, "safe_fail": 0.0, "latency_ms": 0.005625000085274223, "cost_usd": 0.0, "repair_attempts": 0, "provider": "dummy-llm"}
3
+ {"query": "sum of spend", "exec_acc": 0.0, "safe_fail": 0.0, "latency_ms": 0.20833300004596822, "cost_usd": 0.0, "repair_attempts": 0, "provider": "dummy-llm"}
benchmarks/run.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # benchmarks/run.py
2
+ from __future__ import annotations
3
+ import argparse
4
+ import os
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+
9
+ # ---- app imports
10
+ from nl2sql.pipeline import Pipeline
11
+ from nl2sql.ambiguity_detector import AmbiguityDetector
12
+ from nl2sql.planner import Planner
13
+ from nl2sql.generator import Generator
14
+ from nl2sql.safety import Safety
15
+ from nl2sql.executor import Executor
16
+ from nl2sql.verifier import Verifier
17
+ from nl2sql.repair import Repair
18
+
19
+ # ---- adapters
20
+ from adapters.db.sqlite_adapter import SQLiteAdapter
21
+ from adapters.llm.openai_provider import OpenAIProvider
22
+
23
+ # ---- fallbacks: Dummy LLM (so it runs without API keys)
24
+ class DummyLLM:
25
+ provider_id = "dummy-llm"
26
+
27
+ def plan(self, *, user_query: str, schema_preview: str):
28
+ text = f"- understand question: {user_query}\n- identify tables\n- join if needed\n- filter\n- order/limit"
29
+ return text, 0, 0, 0.0
30
+
31
+ def generate_sql(self, *, user_query: str, schema_preview: str, plan_text: str, clarify_answers=None):
32
+ # naive demo SQL (so pipeline flows end-to-end)
33
+ sql = "SELECT 1 AS one;"
34
+ rationale = "Demo SQL from DummyLLM"
35
+ return sql, rationale, 0, 0, 0.0
36
+
37
+ def repair(self, *, sql: str, error_msg: str, schema_preview: str):
38
+ return sql, 0, 0, 0.0
39
+
40
+
41
+ def ensure_demo_db(path: Path) -> None:
42
+ """Create a tiny SQLite db if missing, so executor has something to run."""
43
+ if path.exists():
44
+ return
45
+ import sqlite3
46
+ path.parent.mkdir(parents=True, exist_ok=True)
47
+ con = sqlite3.connect(path)
48
+ cur = con.cursor()
49
+ cur.execute("CREATE TABLE users(id INTEGER PRIMARY KEY, name TEXT, spend REAL);")
50
+ cur.executemany("INSERT INTO users(id,name,spend) VALUES(?,?,?)",
51
+ [(1,"Alice",120.5),(2,"Bob",80.0),(3,"Carol",155.0)])
52
+ con.commit()
53
+ con.close()
54
+
55
+
56
+ def build_pipeline(db_path: Path, use_openai: bool) -> Pipeline:
57
+ # DB adapter
58
+ db = SQLiteAdapter(str(db_path))
59
+ executor = Executor(db)
60
+ # LLM provider
61
+ if use_openai and os.getenv("OPENAI_API_KEY"):
62
+ llm = OpenAIProvider()
63
+ else:
64
+ llm = DummyLLM()
65
+ # stages
66
+ detector = AmbiguityDetector()
67
+ planner = Planner(llm)
68
+ generator = Generator(llm)
69
+ safety = Safety()
70
+ verifier = Verifier()
71
+ repair = Repair(llm)
72
+ # pipeline
73
+ return Pipeline(
74
+ detector=detector,
75
+ planner=planner,
76
+ generator=generator,
77
+ safety=safety,
78
+ executor=executor,
79
+ verifier=verifier,
80
+ repair=repair,
81
+ )
82
+
83
+
84
+ def run_benchmark(queries, schema_preview, pipeline: Pipeline, outfile: Path):
85
+ results = []
86
+ for q in queries:
87
+ t0 = time.perf_counter()
88
+ r = pipeline.run(user_query=q, schema_preview=schema_preview)
89
+ latency_ms = (time.perf_counter()-t0)*1000
90
+ ok = (not r.get("ambiguous")) and ("error" not in r)  # NOTE(review): app/routers/nl2sql.py treats pipeline.run() as returning a StageResult dataclass, not a dict — confirm which API is current
91
+
92
+ traces = r.get("traces", [])
93
+ cost_sum = 0.0
94
+ for t in traces:
95
+ try:
96
+ cost_sum += float(t.get("cost_usd", 0.0))
97
+ except Exception:
98
+ pass
99
+
100
+ results.append({
101
+ "query": q,
102
+ "exec_acc": 1.0 if ok else 0.0,
103
+ "safe_fail": 0.0 if ok else 1.0 if "unsafe" in str(r).lower() else 0.0,
104
+ "latency_ms": latency_ms,
105
+ "cost_usd": cost_sum,
106
+ "repair_attempts": sum(1 for t in traces if t.get("stage") == "repair"),
107
+ "provider": pipeline.generator.llm.provider_id if hasattr(pipeline.generator, "llm") else "unknown",
108
+ })
109
+
110
+ outfile.parent.mkdir(parents=True, exist_ok=True)
111
+ with open(outfile, "w") as f:
112
+ for row in results:
113
+ f.write(json.dumps(row) + "\n")
114
+ print(f"[OK] wrote {len(results)} rows → {outfile}")
115
+
116
+
117
+ def main():
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument("--outfile", default="benchmarks/results/demo.jsonl")
120
+ parser.add_argument("--db", default="data/bench_demo.db")
121
+ parser.add_argument("--use-openai", action="store_true", help="Use OpenAI provider if API key present")
122
+ args = parser.parse_args()
123
+
124
+ ROOT = Path(__file__).resolve().parents[1] # project root
125
+ outfile = (ROOT / args.outfile).resolve()
126
+ db_path = (ROOT / args.db).resolve()
127
+
128
+ ensure_demo_db(db_path)
129
+ pipe = build_pipeline(db_path, use_openai=args.use_openai)
130
+
131
+ # a small demo set; replace with Spider when ready
132
+ queries = [
133
+ "show all users",
134
+ "top spenders",
135
+ "sum of spend",
136
+ ]
137
+ schema_preview = "CREATE TABLE users(id INT, name TEXT, spend REAL);"
138
+
139
+ run_benchmark(queries, schema_preview, pipe, outfile)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ main()
docker-compose.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.9"
2
+
3
+ services:
4
+ postgres:
5
+ image: postgres:16
6
+ container_name: nl2sql_pg
7
+ environment:
8
+ POSTGRES_USER: postgres
9
+ POSTGRES_PASSWORD: postgres
10
+ POSTGRES_DB: demo
11
+ volumes:
12
+ - pgdata:/var/lib/postgresql/data
13
+ - ./infra/migrate.sql:/docker-entrypoint-initdb.d/00_init.sql:ro
14
+ ports:
15
+ - "5432:5432"
16
+ healthcheck:
17
+ test: ["CMD-SHELL", "pg_isready -U postgres -d demo"]
18
+ interval: 5s
19
+ timeout: 3s
20
+ retries: 10
21
+
22
+ api:
23
+ build:
24
+ context: .
25
+ dockerfile: Dockerfile
26
+ container_name: nl2sql_api
27
+ depends_on:
28
+ postgres:
29
+ condition: service_healthy
30
+ environment:
31
+ DB_MODE: postgres
32
+ POSTGRES_DSN: dbname=demo user=postgres password=postgres host=postgres port=5432
33
+ OPENAI_MODEL_ID: gpt-4o-mini
34
+ OPENAI_API_KEY: ${OPENAI_API_KEY}
35
+ ports:
36
+ - "8000:8000"
37
+ command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers"]
38
+
39
+ volumes:
40
+ pgdata:
infra/migrate.sql ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ CREATE TABLE IF NOT EXISTS users (
2
+ id SERIAL PRIMARY KEY,
3
+ name TEXT NOT NULL,
4
+ city TEXT
5
+ );
6
+
7
+ INSERT INTO users (name, city)
8
+ VALUES ('Alice', 'Tehran'), ('Bob', 'Karaj'), ('Carol', 'Isfahan');
logs/spider_eval/dev_gold_1760430884.txt DELETED
@@ -1,10 +0,0 @@
1
- SELECT count(*) FROM singer concert_singer
2
- SELECT count(*) FROM singer concert_singer
3
- SELECT name , country , age FROM singer ORDER BY age DESC concert_singer
4
- SELECT name , country , age FROM singer ORDER BY age DESC concert_singer
5
- SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France' concert_singer
6
- SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France' concert_singer
7
- SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1 concert_singer
8
- SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1 concert_singer
9
- SELECT DISTINCT country FROM singer WHERE age > 20 concert_singer
10
- SELECT DISTINCT country FROM singer WHERE age > 20 concert_singer
 
 
 
 
 
 
 
 
 
 
 
logs/spider_eval/dev_metrics_1760430884.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "commit_hash": "e207f417ac5923220817e3c3f61c72e51a98c63b",
3
- "split": "dev",
4
- "limit": 10,
5
- "total_examples": 10,
6
- "valid_examples": 10,
7
- "exact_match_rate": 0.2,
8
- "exact_match_structural_rate": 0.0,
9
- "execution_accuracy_rate": 0.8,
10
- "error_rate": 0.0,
11
- "safe_check_fail_rate": 0.0,
12
- "avg_gen_time": 1.4374850749969483,
13
- "avg_exec_time": 0.0007865667343139648,
14
- "run_id": 1760430884
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/spider_eval/dev_pred_1760430884.txt DELETED
@@ -1,10 +0,0 @@
1
- SELECT COUNT(*) AS total_singers FROM singer; concert_singer
2
- SELECT COUNT(*) AS total_singers FROM singer; concert_singer
3
- SELECT Name, Country, Age FROM singer ORDER BY Age DESC concert_singer
4
- SELECT Name, Country, Age FROM singer ORDER BY Age DESC concert_singer
5
- SELECT AVG(Age) AS average_age, MIN(Age) AS minimum_age, MAX(Age) AS maximum_age FROM singer WHERE Country = 'France' concert_singer
6
- SELECT AVG(Age) AS average_age, MIN(Age) AS minimum_age, MAX(Age) AS maximum_age FROM singer WHERE Country = 'France'; concert_singer
7
- SELECT Name, Song_Name, Song_release_year FROM singer WHERE Age = (SELECT MAX(Age) FROM singer) concert_singer
8
- SELECT Song_Name, Song_release_year FROM singer WHERE Age = (SELECT MAX(Age) FROM singer) concert_singer
9
- SELECT DISTINCT Country FROM singer WHERE Age > 20 concert_singer
10
- SELECT DISTINCT Country FROM singer WHERE Age > 20 concert_singer
 
 
 
 
 
 
 
 
 
 
 
logs/spider_eval/dev_results_1760430884.jsonl DELETED
@@ -1,11 +0,0 @@
1
- # {"commit_hash": "e207f417ac5923220817e3c3f61c72e51a98c63b", "split": "dev", "limit": 10, "start_time": 1760430884}
2
- {"db_id": "concert_singer", "question": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "SELECT COUNT(*) AS total_singers FROM singer;", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"total_singers\": 6\n }\n]", "gen_time": 1.2182981967926025, "exec_time": 0.0008916854858398438, "error": null, "gold_error": null, "pred_rows": "[(6,)]", "gold_rows": "[(6,)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
3
- {"db_id": "concert_singer", "question": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "SELECT COUNT(*) AS total_singers FROM singer;", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"total_singers\": 6\n }\n]", "gen_time": 1.261944055557251, "exec_time": 0.00044798851013183594, "error": null, "gold_error": null, "pred_rows": "[(6,)]", "gold_rows": "[(6,)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
4
- {"db_id": "concert_singer", "question": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "SELECT Name, Country, Age FROM singer ORDER BY Age DESC", "status": "Returned 6 row(s). Showing up to 50.", "output": "[\n {\n \"Name\": \"Joe Sharp\",\n \"Country\": \"Netherlands\",\n \"Age\": 52\n },\n {\n \"Name\": \"John Nizinik\",\n \"Country\": \"France\",\n \"Age\": 43\n },\n {\n \"Name\": \"Rose White\",\n \"Country\": \"France\",\n \"Age\": 41\n },\n {\n \"Name\": \"Timbaland\",\n \"Country\": \"United States\",\n \"Age\": 32\n },\n {\n \"Name\": \"Justin Brown\",\n \"Country\": \"France\",\n \"Age\": 29\n },\n {\n \"Name\": \"Tribal King\",\n \"Country\": \"France\",\n \"Age\": 25\n }\n]", "gen_time": 1.0276496410369873, "exec_time": 0.0006437301635742188, "error": null, "gold_error": null, "pred_rows": "[('Joe Sharp', 'Netherlands', 52), ('John Nizinik', 'France', 43), ('Rose White', 'France', 41), ('Timbaland', 'United States', 32), ('Justin Brown', 'France', 29), ('Tribal King', 'France', 25)]", "gold_rows": "[('Joe Sharp', 'Netherlands', 52), ('John Nizinik', 'France', 43), ('Rose White', 'France', 41), ('Timbaland', 'United States', 32), ('Justin Brown', 'France', 29), ('Tribal King', 'France', 25)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
5
- {"db_id": "concert_singer", "question": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "SELECT Name, Country, Age FROM singer ORDER BY Age DESC", "status": "Returned 6 row(s). Showing up to 50.", "output": "[\n {\n \"Name\": \"Joe Sharp\",\n \"Country\": \"Netherlands\",\n \"Age\": 52\n },\n {\n \"Name\": \"John Nizinik\",\n \"Country\": \"France\",\n \"Age\": 43\n },\n {\n \"Name\": \"Rose White\",\n \"Country\": \"France\",\n \"Age\": 41\n },\n {\n \"Name\": \"Timbaland\",\n \"Country\": \"United States\",\n \"Age\": 32\n },\n {\n \"Name\": \"Justin Brown\",\n \"Country\": \"France\",\n \"Age\": 29\n },\n {\n \"Name\": \"Tribal King\",\n \"Country\": \"France\",\n \"Age\": 25\n }\n]", "gen_time": 1.4854280948638916, "exec_time": 0.0007121562957763672, "error": null, "gold_error": null, "pred_rows": "[('Joe Sharp', 'Netherlands', 52), ('John Nizinik', 'France', 43), ('Rose White', 'France', 41), ('Timbaland', 'United States', 32), ('Justin Brown', 'France', 29), ('Tribal King', 'France', 25)]", "gold_rows": "[('Joe Sharp', 'Netherlands', 52), ('John Nizinik', 'France', 43), ('Rose White', 'France', 41), ('Timbaland', 'United States', 32), ('Justin Brown', 'France', 29), ('Tribal King', 'France', 25)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
6
- {"db_id": "concert_singer", "question": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "SELECT AVG(Age) AS average_age, MIN(Age) AS minimum_age, MAX(Age) AS maximum_age FROM singer WHERE Country = 'France'", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"average_age\": 34.5,\n \"minimum_age\": 25,\n \"maximum_age\": 43\n }\n]", "gen_time": 1.8432340621948242, "exec_time": 0.0009491443634033203, "error": null, "gold_error": null, "pred_rows": "[(34.5, 25, 43)]", "gold_rows": "[(34.5, 25, 43)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
7
- {"db_id": "concert_singer", "question": "What is the average, minimum, and maximum age for all French singers?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "SELECT AVG(Age) AS average_age, MIN(Age) AS minimum_age, MAX(Age) AS maximum_age FROM singer WHERE Country = 'France';", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"average_age\": 34.5,\n \"minimum_age\": 25,\n \"maximum_age\": 43\n }\n]", "gen_time": 2.5529589653015137, "exec_time": 0.0012962818145751953, "error": null, "gold_error": null, "pred_rows": "[(34.5, 25, 43)]", "gold_rows": "[(34.5, 25, 43)]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
8
- {"db_id": "concert_singer", "question": "Show the name and the release year of the song by the youngest singer.", "gold_sql": "SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1", "pred_sql": "SELECT Name, Song_Name, Song_release_year FROM singer WHERE Age = (SELECT MAX(Age) FROM singer)", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"Name\": \"Joe Sharp\",\n \"Song_Name\": \"You\",\n \"Song_release_year\": \"1992\"\n }\n]", "gen_time": 1.2428169250488281, "exec_time": 0.0007181167602539062, "error": null, "gold_error": null, "pred_rows": "[('Joe Sharp', 'You', '1992')]", "gold_rows": "[('Love', '2016')]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": false, "safe_check_failed": false}
9
- {"db_id": "concert_singer", "question": "What are the names and release years for all the songs of the youngest singer?", "gold_sql": "SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1", "pred_sql": "SELECT Song_Name, Song_release_year FROM singer WHERE Age = (SELECT MAX(Age) FROM singer)", "status": "Returned 1 row(s). Showing up to 50.", "output": "[\n {\n \"Song_Name\": \"You\",\n \"Song_release_year\": \"1992\"\n }\n]", "gen_time": 1.4568238258361816, "exec_time": 0.0009098052978515625, "error": null, "gold_error": null, "pred_rows": "[('You', '1992')]", "gold_rows": "[('Love', '2016')]", "exact_match": false, "exact_match_structural": false, "execution_accuracy": false, "safe_check_failed": false}
10
- {"db_id": "concert_singer", "question": "What are all distinct countries where singers above age 20 are from?", "gold_sql": "SELECT DISTINCT country FROM singer WHERE age > 20", "pred_sql": "SELECT DISTINCT Country FROM singer WHERE Age > 20", "status": "Returned 3 row(s). Showing up to 50.", "output": "[\n {\n \"Country\": \"Netherlands\"\n },\n {\n \"Country\": \"United States\"\n },\n {\n \"Country\": \"France\"\n }\n]", "gen_time": 0.9801719188690186, "exec_time": 0.0007050037384033203, "error": null, "gold_error": null, "pred_rows": "[('Netherlands',), ('United States',), ('France',)]", "gold_rows": "[('Netherlands',), ('United States',), ('France',)]", "exact_match": true, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
11
- {"db_id": "concert_singer", "question": "What are the different countries with singers above age 20?", "gold_sql": "SELECT DISTINCT country FROM singer WHERE age > 20", "pred_sql": "SELECT DISTINCT Country FROM singer WHERE Age > 20", "status": "Returned 3 row(s). Showing up to 50.", "output": "[\n {\n \"Country\": \"Netherlands\"\n },\n {\n \"Country\": \"United States\"\n },\n {\n \"Country\": \"France\"\n }\n]", "gen_time": 1.3055250644683838, "exec_time": 0.0005917549133300781, "error": null, "gold_error": null, "pred_rows": "[('Netherlands',), ('United States',), ('France',)]", "gold_rows": "[('Netherlands',), ('United States',), ('France',)]", "exact_match": true, "exact_match_structural": false, "execution_accuracy": true, "safe_check_failed": false}
 
 
 
 
 
 
 
 
 
 
 
 
nl2sql/__init__.py ADDED
File without changes
nl2sql/ambiguity_detector.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
class AmbiguityDetector:
    """Lightweight AmbiSQL-style ambiguity detection.

    Scans the natural-language query for terms whose meaning depends on
    context (e.g. "recent" -> which time window?, "top" -> by what metric?)
    and returns one human-readable clarification message per hit.
    """

    # Terms that usually need clarification before SQL can be generated.
    AMBIGUOUS_TERMS = ["recent", "top", "name", "rank", "latest"]

    def detect(self, query: str, schema_preview: str) -> list[str]:
        """Return a list of ambiguity messages; empty when the query is clear.

        ``schema_preview`` is accepted for interface parity with the other
        stages; this lightweight detector does not inspect it yet.
        """
        hits: list[str] = []
        q_lower = query.lower()
        for term in self.AMBIGUOUS_TERMS:
            # re.escape keeps the pattern safe should a term ever contain
            # regex metacharacters; \b enforces whole-word matching.
            if re.search(rf"\b{re.escape(term)}\b", q_lower):
                # Bug fix: the original message ended with a stray
                # apostrophe ("... in this query.'").
                hits.append(f"The term '{term}' is ambiguous in this query.")

        return hits
nl2sql/executor.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from nl2sql.types import StageResult, StageTrace
3
+ from adapters.db.base import DBAdapter
4
+
5
class Executor:
    """Runs a (pre-validated) SQL statement against the configured DB adapter."""

    name = "executor"

    def __init__(self, db: DBAdapter):
        self.db = db

    def run(self, sql: str) -> StageResult:
        """Execute *sql* and wrap the outcome in a timed StageResult."""
        started = time.perf_counter()
        try:
            rows, cols = self.db.execute(sql)
        except Exception as exc:
            # Adapter errors become a failed stage, never a crash.
            failure_trace = StageTrace(
                stage=self.name,
                duration_ms=(time.perf_counter() - started) * 1000,
                notes={"error": str(exc)},
            )
            return StageResult(ok=False, data=None, trace=failure_trace, error=[str(exc)])

        success_trace = StageTrace(
            stage=self.name,
            duration_ms=(time.perf_counter() - started) * 1000,
            notes={"row_count": len(rows), "col_count": len(cols)},
        )
        return StageResult(ok=True, data={"rows": rows, "columns": cols}, trace=success_trace)
nl2sql/generator.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import time
3
+ from typing import Optional, Dict, Any
4
+ from nl2sql.types import StageResult, StageTrace
5
+ from adapters.llm.base import LLMProvider
6
+
7
class Generator:
    """LLM-backed SQL generation stage.

    Wraps ``llm.generate_sql`` and enforces the provider contract: a 5-tuple
    ``(sql, rationale, token_in, token_out, cost_usd)``.  On any contract
    violation the stage returns ok=False *without* a trace — only successful
    generations are timed and accounted.
    """

    name = "generator"

    def __init__(self, llm: LLMProvider) -> None:
        self.llm = llm

    def run(self, *, user_query: str, schema_preview: str, plan_text: str,
            clarify_answers: Optional[Dict[str, Any]] = None) -> StageResult:
        """Generate SQL for *user_query*; ok=False on provider/contract failures."""
        t0 = time.perf_counter()
        try:
            res = self.llm.generate_sql(
                user_query=user_query,
                schema_preview=schema_preview,
                plan_text=plan_text,
                clarify_answers=clarify_answers or {}
            )
        except Exception as e:
            return StageResult(ok=False, error=[f"Generator failed: {e}"])

        # Expect a 5-tuple
        if not isinstance(res, tuple) or len(res) != 5:
            return StageResult(ok=False, error=["Generator contract violation: expected 5-tuple (sql, rationale, t_in, t_out, cost)"])

        sql, rationale, t_in, t_out, cost = res

        # Type/shape checks
        if not isinstance(sql, str) or not sql.strip():
            return StageResult(ok=False, error=["Generator produced empty or non-string SQL"])
        # Accept plain SELECTs and CTE queries ("WITH ... SELECT ...") so this
        # stage agrees with Safety, whose _ALLOW_SELECT regex also allows CTEs.
        # (A "WITH ... DELETE" would still be rejected by the Safety stage.)
        if not sql.lstrip().lower().startswith(("select", "with")):
            return StageResult(ok=False, error=[f"Generated non-SELECT SQL: {sql}"])

        rationale = rationale or ""  # guard len() below against None
        trace = StageTrace(
            stage=self.name,
            duration_ms=(time.perf_counter() - t0) * 1000.0,
            token_in=t_in,
            token_out=t_out,
            cost_usd=cost,
            notes={"rationale_len": len(rationale)},
        )

        return StageResult(ok=True, data={"sql": sql, "rationale": rationale}, trace=trace)
49
+
nl2sql/pipeline.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import traceback
3
+ from typing import Dict, Any, Optional, List
4
+ from nl2sql.types import StageResult
5
+ from nl2sql.ambiguity_detector import AmbiguityDetector
6
+ from nl2sql.planner import Planner
7
+ from nl2sql.generator import Generator
8
+ from nl2sql.safety import Safety
9
+ from nl2sql.executor import Executor
10
+ from nl2sql.verifier import Verifier
11
+ from nl2sql.repair import Repair
12
+
13
+
14
class Pipeline:
    """
    NL2SQL Copilot pipeline with guaranteed dict output.

    Stage order: ambiguity detection -> planner -> generator -> safety ->
    executor -> verifier, with a bounded repair loop when verification fails.
    All stages return structured traces and errors, but the final result is
    always a JSON-safe dict (see ``run`` for the exact shape).
    """

    def __init__(self, *,
                 detector: AmbiguityDetector,
                 planner: Planner,
                 generator: Generator,
                 safety: Safety,
                 executor: Executor,
                 verifier: Verifier,
                 repair: Repair):
        self.detector = detector
        self.planner = planner
        self.generator = generator
        self.safety = safety
        self.executor = executor
        self.verifier = verifier
        self.repair = repair

    # ------------------------------------------------------------
    def _trace_list(self, *stages: StageResult) -> List[dict]:
        """Collect each stage's trace as a plain dict, skipping absent ones."""
        traces = []
        for s in stages:
            if not s:
                continue
            t = getattr(s, "trace", None)
            if t:
                traces.append(t.__dict__)
        return traces

    # ------------------------------------------------------------
    def _safe_stage(self, fn, **kwargs) -> StageResult:
        """Run a stage safely; if it throws, catch and convert to StageResult."""
        try:
            r = fn(**kwargs)
            if isinstance(r, StageResult):
                return r
            # not ideal, but wrap it
            return StageResult(ok=True, data=r, trace=None)
        except Exception as e:
            tb = traceback.format_exc()
            # Bug fix: the StageResult field is ``error`` (nl2sql/types.py);
            # the old ``errors=`` keyword raised TypeError and hid the failure.
            return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])

    # ------------------------------------------------------------
    def run(self, *, user_query: str, schema_preview: str,
            clarify_answers: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Always returns:
        {
          "ambiguous": bool,
          "error": bool,
          "details": list[str] | None,
          "sql": str | None,
          "rationale": str | None,
          "verified": bool | None,
          "traces": list[dict]
        }
        """
        traces: List[dict] = []
        details: List[str] = []
        sql, rationale, verified = None, None, None

        # --- 1) ambiguity detection
        try:
            questions = self.detector.detect(user_query, schema_preview)
            if questions:
                return {
                    "ambiguous": True,
                    "error": False,
                    "details": [f"Ambiguities found: {len(questions)}"],
                    "questions": questions,
                    "traces": []
                }
        except Exception as e:
            return {"ambiguous": True, "error": True, "details": [f"Detector failed: {e}"], "traces": []}

        # --- 2) planner
        r_plan = self._safe_stage(self.planner.run, user_query=user_query, schema_preview=schema_preview)
        traces.extend(self._trace_list(r_plan))
        if not r_plan.ok:
            # Bug fix: read ``.error`` — StageResult has no ``errors`` attribute,
            # so the old code raised AttributeError on every failure path.
            return {"ambiguous": False, "error": True, "details": r_plan.error, "traces": traces}

        # --- 3) generator
        r_gen = self._safe_stage(self.generator.run,
                                 user_query=user_query,
                                 schema_preview=schema_preview,
                                 plan_text=r_plan.data.get("plan"),
                                 clarify_answers=clarify_answers or {})
        traces.extend(self._trace_list(r_gen))
        if not r_gen.ok:
            return {"ambiguous": False, "error": True, "details": r_gen.error, "traces": traces}
        sql = r_gen.data.get("sql")
        rationale = r_gen.data.get("rationale")

        # --- 4) safety
        r_safe = self._safe_stage(self.safety.check, sql=sql)
        traces.extend(self._trace_list(r_safe))
        if not r_safe.ok:
            return {"ambiguous": False, "error": True, "details": r_safe.error, "traces": traces}

        # --- 5) executor (failure is recoverable: repair loop may fix the SQL)
        r_exec = self._safe_stage(self.executor.run, sql=r_safe.data["sql"])
        traces.extend(self._trace_list(r_exec))
        if not r_exec.ok:
            details.extend(r_exec.error or [])

        # --- 6) verifier
        r_ver = self._safe_stage(self.verifier.run, sql=sql, exec_result=r_exec)
        traces.extend(self._trace_list(r_ver))
        verified = bool(r_ver.ok)

        # --- 7) repair loop if verification failed (at most 2 attempts)
        if not verified:
            for attempt in range(2):
                r_fix = self._safe_stage(self.repair.run,
                                         sql=sql,
                                         error_msg="; ".join(details or ["unknown"]),
                                         schema_preview=schema_preview)
                traces.extend(self._trace_list(r_fix))
                if not r_fix.ok:
                    break
                sql = r_fix.data.get("sql")
                # Re-run safety -> executor -> verifier on the repaired SQL.
                r_safe = self._safe_stage(self.safety.check, sql=sql)
                traces.extend(self._trace_list(r_safe))
                if not r_safe.ok:
                    details.extend(r_safe.error or [])
                    continue
                r_exec = self._safe_stage(self.executor.run, sql=r_safe.data["sql"])
                traces.extend(self._trace_list(r_exec))
                if not r_exec.ok:
                    details.extend(r_exec.error or [])
                    continue
                r_ver = self._safe_stage(self.verifier.run, sql=sql, exec_result=r_exec)
                traces.extend(self._trace_list(r_ver))
                verified = bool(r_ver.ok)
                if verified:
                    break

        # --- Final result dict
        return {
            "ambiguous": False,
            "error": len(details) > 0 and not verified,
            "details": details or None,
            "sql": sql,
            "rationale": rationale,
            "verified": verified,
            "traces": traces,
        }
nl2sql/planner.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import time
3
+ from nl2sql.types import StageResult, StageTrace
4
+ from adapters.llm.base import LLMProvider
5
+
6
class Planner:
    """LLM stage that turns a user question + schema preview into a textual query plan."""

    name = "planner"

    def __init__(self, llm: LLMProvider) -> None:
        self.llm = llm

    def run(self, *, user_query: str, schema_preview: str) -> StageResult:
        """Ask the LLM for a plan.

        Always returns ok=True; exceptions are caught by the pipeline's
        ``_safe_stage`` wrapper rather than here.
        """
        t0 = time.perf_counter()
        # Provider contract: (plan_text, tokens_in, tokens_out, cost_usd).
        plan_text, t_in, t_out, cost = self.llm.plan(user_query=user_query, schema_preview=schema_preview)
        trace = StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000,
                           token_in=t_in, token_out=t_out, cost_usd=cost, notes={"len_plan": len(plan_text)})
        return StageResult(ok=True, data={"plan": plan_text}, trace=trace)
nl2sql/repair.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+
4
+ from nl2sql.types import StageTrace, StageResult
5
+ from adapters.llm.base import LLMProvider
6
+
7
+ GUIDELINES = """
8
+ When repairing:
9
+ 1. Keep query SELECT-only.
10
+ 2. Explicitly qualify ambiguous columns with table names.
11
+ 3. Match GROUP BY fields with aggregations.
12
+ 4. Use known foreign keys for JOIN.
13
+ 5. Add a reasonable LIMIT if missing.
14
+ Return only the corrected SQL.
15
+ """
16
+
17
class Repair:
    """LLM stage that attempts to fix a failing SQL statement."""

    name = "repair"

    def __init__(self, llm: LLMProvider):
        self.llm = llm

    def run(self, sql: str, error_msg: str, schema_preview: str) -> StageResult:
        """Ask the LLM to repair *sql*, prefixing the error with the GUIDELINES prompt."""
        t0 = time.perf_counter()
        # Provider contract: (fixed_sql, tokens_in, tokens_out, cost_usd).
        fixed_sql, t_in, t_out, cost = self.llm.repair(sql=sql, error_msg=f"{GUIDELINES}\n\n{error_msg}",
                                                       schema_preview=schema_preview)
        trace = StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000,
                           token_in=t_in, token_out=t_out, cost_usd=cost,
                           notes={"old_sql_len": len(sql), "new_sql_len": len(fixed_sql)})
        return StageResult(ok=True, data={"sql": fixed_sql}, trace=trace)
nl2sql/safety.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re, time
3
+ from nl2sql.types import StageResult, StageTrace
4
+
5
+ # --- Regex utils ---
6
+ _COMMENT_BLOCK = re.compile(r"/\*.*?\*/", re.DOTALL)
7
+ _COMMENT_LINE = re.compile(r"--.*?$", re.MULTILINE)
8
+ # string literals (single & double quotes), allow escaped quotes
9
+ _STRING_SINGLE = re.compile(r"'([^'\\]|\\.)*'", re.DOTALL)
10
+ _STRING_DOUBLE = re.compile(r'"([^"\\]|\\.)*"', re.DOTALL)
11
+
12
+ # case-insensitive, word-boundary forbidden keywords
13
+ _FORBIDDEN = re.compile(
14
+ r"\b(delete|update|insert|drop|create|alter|attach|pragma|reindex|vacuum|replace|grant|revoke|execute)\b",
15
+ re.IGNORECASE,
16
+ )
17
+
18
+ # allow: SELECT ... or WITH <cte...> SELECT ...
19
+ _ALLOW_SELECT = re.compile(r"^(?:WITH\b.*?\)\s*)?SELECT\b", re.IGNORECASE | re.DOTALL)
20
+
21
def _strip_comments(s: str) -> str:
    """Replace /* ... */ block and -- line comments with a space each."""
    s = _COMMENT_BLOCK.sub(" ", s)
    s = _COMMENT_LINE.sub(" ", s)
    return s
25
+
26
def _mask_strings(s: str) -> str:
    """Collapse quoted literals to 'X' / "X" so keywords inside strings cannot trip the checks."""
    s = _STRING_SINGLE.sub("'X'", s)
    s = _STRING_DOUBLE.sub('"X"', s)
    return s
30
+
31
+ def _split_statements(s: str) -> list[str]:
32
+ parts = [p.strip() for p in s.split(";")]
33
+ return [p for p in parts if p]
34
+
35
class Safety:
    """SELECT-only gate applied to every generated SQL statement.

    Comments and string literals are neutralised first so that forbidden
    keywords hiding inside them can neither trigger nor dodge the checks.
    """

    name = "safety"

    def check(self, sql: str) -> StageResult:
        """Validate *sql*; on ok=True, ``data['sql']`` carries the stripped statement forward."""
        t0 = time.perf_counter()
        # Bug fix: removed the leftover debug print() of every SQL candidate —
        # a library stage must not write to stdout.
        s = _strip_comments(sql)
        s = _mask_strings(s).strip()

        # Exactly one statement allowed — blocks "SELECT 1; DROP TABLE x".
        stmts = _split_statements(s)
        if len(stmts) != 1:
            return StageResult(
                ok=False,
                error=["Multiple statements detected"],
                trace=StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000),
            )

        body = stmts[0]

        # Any DML/DDL keyword anywhere in the masked statement is rejected.
        if _FORBIDDEN.search(body):
            return StageResult(
                ok=False,
                error=["Forbidden keyword detected"],
                trace=StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000),
            )

        # Must start with SELECT, optionally preceded by a WITH ... CTE block.
        if not _ALLOW_SELECT.match(body):
            return StageResult(
                ok=False,
                error=["Non-SELECT statement"],
                trace=StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000),
            )

        return StageResult(
            ok=True,
            data={
                "sql": sql.strip(),
                "rationale": "Statement validated as SELECT-only (strings/comments ignored).",
            },
            trace=StageTrace(stage=self.name, duration_ms=(time.perf_counter()-t0)*1000),
        )
nl2sql/stubs.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nl2sql.types import StageResult, StageTrace
2
+
3
class NoOpExecutor:
    """Test stand-in: never touches a database, always 'succeeds' with no rows."""

    name = "executor"

    def run(self, sql: str) -> StageResult:
        # pretend success, return empty result set
        return StageResult(
            ok=True,
            data={"rows": [], "columns": []},
            trace=StageTrace(stage=self.name, duration_ms=0.0, notes={"noop": True})
        )
12
+
13
class NoOpVerifier:
    """Test stand-in: reports every statement as verified."""

    name = "verifier"

    def run(self, sql: str, exec_result: StageResult) -> StageResult:
        # always verified for legacy tests
        return StageResult(
            ok=True,
            data={"verified": True},
            trace=StageTrace(stage=self.name, duration_ms=0.0, notes={"noop": True})
        )
22
+
23
class NoOpRepair:
    """Test stand-in: 'repairs' by returning the original SQL unchanged."""

    name = "repair"

    def run(self, sql: str, error_msg: str, schema_preview: str) -> StageResult:
        # return original SQL unchanged
        return StageResult(
            ok=True,
            data={"sql": sql},
            trace=StageTrace(stage=self.name, duration_ms=0.0, notes={"noop": True})
        )
nl2sql/types.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, Optional, List
3
+
4
@dataclass(frozen=True)
class StageTrace:
    """Immutable telemetry record emitted by a pipeline stage."""

    stage: str                              # stage name, e.g. "planner"
    duration_ms: float                      # wall-clock time spent in the stage
    notes: Optional[Dict[str, Any]] = None  # free-form stage-specific metadata
    token_in: Optional[int] = None          # LLM prompt tokens (LLM stages only)
    token_out: Optional[int] = None         # LLM completion tokens (LLM stages only)
    cost_usd: Optional[float] = None        # estimated LLM cost (LLM stages only)
12
+
13
@dataclass(frozen=True)
class StageResult:
    """Outcome of a single pipeline stage.

    ``error`` holds human-readable failure messages (None on success).
    Several call sites historically read ``.errors`` (plural); the read-only
    ``errors`` property keeps them working without duplicating state.
    """

    ok: bool                                 # did the stage succeed?
    data: Optional[Any] = None               # stage payload (shape varies per stage)
    trace: Optional[StageTrace] = None       # timing/token telemetry, if recorded
    error: Optional[List[str]] = None        # failure messages, None when ok
    notes: Optional[Dict[str, Any]] = None   # free-form extras

    @property
    def errors(self) -> Optional[List[str]]:
        """Backward-compatible alias for ``error``."""
        return self.error
nl2sql/verifier.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlglot
2
+ from sqlglot import expressions as exp
3
+ from nl2sql.types import StageResult, StageTrace
4
+
5
class Verifier:
    """Post-execution sanity checks on the generated SQL."""

    name = "verifier"

    def run(self, sql: str, exec_result: StageResult) -> StageResult:
        """Fail if execution failed, or the parsed SQL violates simple structural rules."""
        if not exec_result.ok:
            # Bug fix: propagate ``exec_result.error`` — StageResult has no
            # ``errors`` attribute, so the old code raised AttributeError here.
            return StageResult(ok=False, data=None,
                               trace=StageTrace(stage=self.name, duration_ms=0,
                                                notes={"reason": "execution_error"}),
                               error=exec_result.error)

        # Rule 1: check SELECT / GROUP consistency.
        # NOTE(review): bare aggregates (e.g. "SELECT COUNT(*) FROM t") are
        # legal SQL; this intentionally strict rule routes them through the
        # repair loop — confirm that is the desired behavior.
        issues = []
        try:
            tree = sqlglot.parse_one(sql)
            if isinstance(tree, exp.Select):
                group = tree.args.get("group")
                aggs = list(tree.find_all(exp.AggFunc))
                if aggs and not group:
                    issues.append("Aggregation without GROUP BY.")
        except Exception as e:
            issues.append(f"Parse error during verification: {e}")

        if issues:
            return StageResult(ok=False, data=None,
                               trace=StageTrace(stage=self.name, duration_ms=0,
                                                notes={"issues": issues}),
                               error=issues)
        return StageResult(ok=True, data={"verified": True},
                           trace=StageTrace(stage=self.name, duration_ms=0))
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
- gradio
2
- langchain
3
- langchain-openai
4
- langchain_community
5
- sqlglot
6
- openai
7
- python-dotenv
8
- dotenv
 
 
 
 
1
fastapi==0.115.2
uvicorn[standard]==0.30.6
pydantic==2.9.2
sqlglot==27.26.0
requests==2.32.3
streamlit==1.39.0
plotly==5.24.1
pytest==8.3.3
python-dotenv==1.1.1
openai==2.6.1
psycopg[binary]~=3.2
httpx==0.27.2   # required by fastapi.testclient.TestClient
ruff==0.6.9     # CI runs "ruff check ."
mypy==1.11.2    # CI runs "mypy ."
tests/conftest.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

# Load the repository-root .env before any tests run, so tests see the same
# configuration (API keys, DB settings) as the application itself.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
ENV_PATH = os.path.join(ROOT_DIR, ".env")

load_dotenv(dotenv_path=ENV_PATH)
tests/test_ambiguity.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nl2sql.ambiguity_detector import AmbiguityDetector
2
+ from nl2sql.types import StageResult
3
+ from app.routers import nl2sql
4
+
5
def test_detects_ambiguous_terms():
    """Vague words like 'recent'/'top' must each yield a clarification message."""
    det = AmbiguityDetector()
    res = det.detect("Show me recent top singers", "table: singer(id,name,age)")
    assert len(res) >= 1
    # "recent" comes first in AMBIGUOUS_TERMS, so it is the first hit.
    assert "recent" in res[0].lower()
10
+
11
def test_not_false_positive():
    """A fully specified query must produce no ambiguity hits."""
    det = AmbiguityDetector()
    res = det.detect("List all singers older than 30", "table: singer(id, name, age)")
    assert res == []
15
+
16
def test_ambiguity_response():
    """The router's _to_dict helper must preserve the 'ambiguous' flag."""
    fake_result = StageResult(ok=True, data={"ambiguous": True, "questions": ["Clarify column?"]})
    response = nl2sql._to_dict(fake_result.data)
    assert response["ambiguous"] is True
tests/test_executor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nl2sql.executor import Executor
2
+ from adapters.db.sqlite_adapter import SQLiteAdapter
3
+
4
def test_executor_runs_select(tmp_path):
    """Executor should run a SELECT against a real SQLite file and return rows."""
    db_path = tmp_path / "test.db"
    import sqlite3
    conn = sqlite3.connect(db_path)
    conn.execute("CREATE TABLE users(id INT, name TEXT);")
    conn.execute("INSERT INTO users VALUES (1, 'Alice');")
    conn.commit()
    conn.close()

    ex = Executor(SQLiteAdapter(str(db_path)))
    res = ex.run("SELECT * FROM users;")
    assert res.ok
    # rows come back as tuples: (id, name)
    assert res.data["rows"][0][1] == "Alice"
tests/test_generator.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from nl2sql.generator import Generator
3
+ from nl2sql.types import StageResult
4
+
5
+
6
+ # --- Dummy LLMs (respect the 5-tuple contract) --------------------------------
7
+
8
class LLM_OK:
    """Happy-path dummy honouring the (sql, rationale, t_in, t_out, cost) contract."""

    def generate_sql(self, **kwargs):
        return ("SELECT * FROM singer;", "list all", 10, 5, 0.00001)
12
+
13
+
14
class LLM_EMPTY_SQL:
    """Dummy returning an empty SQL string — Generator must flag this as an error."""

    def generate_sql(self, **kwargs):
        return ("", "reason", 10, 5, 0.0)
18
+
19
+
20
class LLM_NON_SELECT:
    """Dummy returning non-SELECT SQL — Generator must flag this as an error."""

    def generate_sql(self, **kwargs):
        return ("UPDATE users SET name='x' WHERE id=1;", "bad", 8, 3, 0.0)
24
+
25
+
26
class LLM_CONTRACT_NONE:
    """Contract-violating dummy: yields None instead of the required 5-tuple."""

    def generate_sql(self, **kwargs):
        return None
30
+
31
+
32
class LLM_CONTRACT_SHORT:
    """Contract-violating dummy: tuple with only 2 of the required 5 items."""

    def generate_sql(self, **kwargs):
        return ("SELECT * FROM singer;", "list all")
36
+
37
+
38
+ # --- Parametrized negative cases ----------------------------------------------
39
+
40
@pytest.mark.parametrize(
    "llm, err_keyword",
    [
        (LLM_EMPTY_SQL(), "empty"),        # empty or non-string sql
        (LLM_NON_SELECT(), "non-select"),  # generated non-SELECT
        (LLM_CONTRACT_NONE(), "contract violation"),
        (LLM_CONTRACT_SHORT(), "contract violation"),
    ],
)
def test_generator_errors_do_not_create_trace(llm, err_keyword):
    """Every contract violation must yield ok=False, a matching message, and no trace."""
    gen = Generator(llm=llm)
    r = gen.run(
        user_query="show all singers",
        schema_preview="CREATE TABLE singer(id int, name text);",
        plan_text="-- plan --",
        clarify_answers={}
    )
    assert isinstance(r, StageResult)
    assert r.ok is False
    # Error message is flexible; just check a keyword
    joined = " ".join(r.error or []).lower()
    assert err_keyword in joined
    # On errors, Generator should not attach a trace (we measure only successful stage)
    assert r.trace is None
64
+
65
+
66
+ # --- Positive case (success) ---------------------------------------------------
67
+
68
def test_generator_success_has_valid_trace_and_data():
    """A well-behaved LLM must yield ok=True with SQL, rationale, and a coherent trace."""
    gen = Generator(llm=LLM_OK())
    r = gen.run(
        user_query="show all singers",
        schema_preview="CREATE TABLE singer(id int, name text);",
        plan_text="-- plan --",
        clarify_answers={}
    )

    # Basic success checks
    assert isinstance(r, StageResult)
    assert r.ok is True
    assert r.data and r.data["sql"].lower().startswith("select")
    assert "rationale" in r.data

    # Trace should exist and be coherent
    assert r.trace is not None
    assert r.trace.stage == "generator"
    assert isinstance(r.trace.duration_ms, float)
    assert r.trace.token_in == 10
    assert r.trace.token_out == 5
    # cost can be float or None depending on provider; if present must be numeric
    if r.trace.cost_usd is not None:
        assert isinstance(r.trace.cost_usd, float)

    # Optional notes check – rationale_len should match length of rationale
    notes = r.trace.notes or {}
    if "rationale_len" in notes:
        assert notes["rationale_len"] == len(r.data.get("rationale", ""))
tests/test_nl2sql_router.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from fastapi.testclient import TestClient
3
+ from app.main import app
4
+ from nl2sql.types import StageResult, StageTrace
5
+
6
+ client = TestClient(app)
7
+
8
+
9
def fake_trace(stage: str):
    """Build a minimal StageTrace for stubbing pipeline responses."""
    return StageTrace(stage=stage, duration_ms=10.0)


# Resolve the route once at import time; fails fast if the handler is renamed.
path = app.url_path_for("nl2sql_handler")
13
+
14
+ # --- 1) Clarify / ambiguity case ---------------------------------------------
15
def test_ambiguity_route(monkeypatch):
    """When the pipeline reports ambiguity, the API echoes clarification questions."""
    from app.routers import nl2sql

    def fake_run(*args, **kwargs):
        # Pipeline stub: ambiguous result carrying one clarification question.
        return StageResult(
            ok=True,
            data={
                "ambiguous": True,
                "questions": ["Which table do you mean?"],
                "traces": [fake_trace("detector")],
            },
        )

    monkeypatch.setattr(nl2sql._pipeline, "run", fake_run)

    response = client.post(
        path,
        json={"query": "show all records", "schema_preview": "CREATE TABLE ..."},
    )

    assert response.status_code == 200
    body = response.json()
    assert body["ambiguous"] is True
    assert "questions" in body
43
+
44
+
45
+ # --- 2) Error / failure case -------------------------------------------------
46
def test_error_route(monkeypatch):
    """A failed pipeline run surfaces as HTTP 400 with the error in `detail`."""
    from app.routers import nl2sql

    def fake_run(*args, **kwargs):
        return StageResult(
            ok=False,
            error=["Bad SQL"],
            data={"traces": [fake_trace("safety")]},
        )

    monkeypatch.setattr(nl2sql._pipeline, "run", fake_run)

    response = client.post(
        path,
        json={
            "query": "drop table users;",
            "schema_preview": "CREATE TABLE users(id int);",
        },
    )

    assert response.status_code == 400
    assert "Bad SQL" in response.json()["detail"]
64
+
65
+
66
+ # --- 3) Success / happy path -------------------------------------------------
67
def test_success_route(monkeypatch):
    """Happy path: generated SQL plus per-stage traces come back as JSON."""
    from app.routers import nl2sql

    def fake_run(*args, **kwargs):
        return StageResult(
            ok=True,
            data={
                "ambiguous": False,
                "sql": "SELECT * FROM users;",
                "rationale": "Simple listing",
                "traces": [fake_trace("planner"), fake_trace("generator")],
            },
        )

    monkeypatch.setattr(nl2sql._pipeline, "run", fake_run)

    response = client.post(
        path,
        json={
            "query": "show all users",
            "schema_preview": "CREATE TABLE users(id int, name text);",
        },
    )

    assert response.status_code == 200
    body = response.json()
    assert body["sql"].lower().startswith("select")
    assert isinstance(body["traces"], list)
    assert any(t["stage"] == "planner" for t in body["traces"])
tests/test_openai_provider.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pytest
3
+ from adapters.llm.openai_provider import OpenAIProvider
4
+
5
+
6
+ # Helper class to fake the completion object returned by OpenAI SDK
7
class FakeCompletion:
    """Duck-typed stand-in for the OpenAI chat-completion response object."""

    def __init__(self, content: str, prompt_tokens=5, completion_tokens=7):
        # Anonymous attribute holders mimic completion.choices[0].message.content.
        message = type("Msg", (), {"content": content})
        self.choices = [type("Choice", (), {"message": message})]
        # Mirrors the SDK's `usage` token-accounting fields.
        self.usage = type(
            "Usage",
            (),
            {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens},
        )
14
+
15
+
16
+ # --- Case 1: clean valid JSON --------------------------------------------------
17
def test_generate_sql_valid_json(monkeypatch):
    """Provider parses a clean JSON payload into (sql, rationale, tokens, cost)."""
    provider = OpenAIProvider()

    reply = json.dumps({
        "sql": "SELECT * FROM singer;",
        "rationale": "List all singers.",
    })
    completion = FakeCompletion(reply)

    # Replace the network call with a canned completion.
    monkeypatch.setattr(
        provider.client.chat.completions, "create", lambda *a, **kw: completion
    )

    sql, rationale, t_in, t_out, cost = provider.generate_sql(
        user_query="show all singers",
        schema_preview="CREATE TABLE singer(id int, name text);",
        plan_text="-- plan --",
        clarify_answers={},
    )

    assert sql.strip().lower().startswith("select")
    assert "singer" in sql.lower()
    assert "list" in rationale.lower()
    assert t_in == 5 and t_out == 7
    assert isinstance(cost, float)
44
+
45
+
46
+ # --- Case 2: malformed JSON with extra text (should still recover) ------------
47
def test_generate_sql_recover_from_partial_json(monkeypatch):
    """Provider extracts the JSON object even when wrapped in extra chatter."""
    provider = OpenAIProvider()

    # Valid JSON object surrounded by prose the model tends to add.
    noisy_reply = (
        "Here is the result:\n"
        '{ "sql": "SELECT * FROM users;", "rationale": "list users" }\n'
        "Thanks!"
    )
    completion = FakeCompletion(noisy_reply)

    monkeypatch.setattr(
        provider.client.chat.completions, "create", lambda *a, **kw: completion
    )

    sql, rationale, *_ = provider.generate_sql(
        user_query="show all users",
        schema_preview="CREATE TABLE users(id int, name text);",
        plan_text="-- plan --",
    )

    assert sql.lower().startswith("select")
    assert "user" in sql.lower()
    assert "list" in rationale.lower()
68
+
69
+
70
+ # --- Case 3: completely invalid JSON (should raise ValueError) ----------------
71
def test_generate_sql_invalid_json(monkeypatch):
    """Unparseable model output must raise ValueError."""
    provider = OpenAIProvider()

    completion = FakeCompletion("This is nonsense output without braces")

    monkeypatch.setattr(
        provider.client.chat.completions, "create", lambda *a, **kw: completion
    )

    with pytest.raises(ValueError):
        provider.generate_sql(
            user_query="show X",
            schema_preview="CREATE TABLE t(id int);",
            plan_text="-- plan --",
        )
tests/test_pipeline_integration.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from nl2sql.pipeline import Pipeline
3
+ from nl2sql.types import StageResult, StageTrace
4
+
5
+
6
+ # --- Dummy stages to isolate pipeline -----------------------------------------
7
+
8
class DummyDetector:
    """Simulates the ambiguity-detector stage."""

    def __init__(self, ambiguous=False):
        # When True, detect() pretends the query needs clarification.
        self.ambiguous = ambiguous

    def detect(self, user_query, schema_preview):
        """Return clarification questions; an empty list means unambiguous."""
        if self.ambiguous:
            return ["Which column?"]
        return []
16
+
17
+
18
class DummyPlanner:
    """Simulates the planner stage."""

    def run(self, *, user_query, schema_preview):
        trace = StageTrace(stage="planner", duration_ms=1.0)
        # Sentinel substring in the query forces a planner failure.
        if "fail_plan" in user_query:
            return StageResult(ok=False, error=["Planner failed"], trace=trace)
        return StageResult(ok=True, data={"plan": "plan text"}, trace=trace)


class DummyGenerator:
    """Simulates the generator stage."""

    def run(self, *, user_query, schema_preview, plan_text, clarify_answers):
        trace = StageTrace(stage="generator", duration_ms=1.0)
        # Sentinel substring in the query forces a generator failure.
        if "fail_gen" in user_query:
            return StageResult(ok=False, error=["Generator failed"], trace=trace)
        return StageResult(
            ok=True,
            data={"sql": "SELECT * FROM singer;", "rationale": "List all singers."},
            trace=trace,
        )


class DummySafety:
    """Simulates the safety stage."""

    def check(self, sql):
        trace = StageTrace(stage="safety", duration_ms=1.0)
        # Anything containing DROP (case-insensitive) is treated as unsafe.
        if "DROP" in sql.upper():
            return StageResult(ok=False, error=["Unsafe SQL"], trace=trace)
        return StageResult(ok=True, data={"sql": sql, "rationale": "safe"}, trace=trace)
45
+
46
+
47
+ # --- 1) Success path ----------------------------------------------------------
48
def test_pipeline_success():
    """All stages succeed: result carries SQL plus a trace from every stage."""
    pipeline = Pipeline(
        detector=DummyDetector(ambiguous=False),
        planner=DummyPlanner(),
        generator=DummyGenerator(),
        safety=DummySafety(),
    )

    result = pipeline.run(
        user_query="show all singers",
        schema_preview="CREATE TABLE singer(id int, name text);",
    )

    assert isinstance(result, StageResult)
    assert result.ok is True
    payload = result.data or {}
    assert payload["sql"].lower().startswith("select")
    stages_seen = {t.stage for t in payload["traces"]}
    assert {"planner", "generator", "safety"} <= stages_seen
68
+
69
+
70
+ # --- 2) Ambiguity case --------------------------------------------------------
71
def test_pipeline_ambiguity():
    """An ambiguous query short-circuits with clarification questions."""
    pipeline = Pipeline(
        detector=DummyDetector(ambiguous=True),
        planner=DummyPlanner(),
        generator=DummyGenerator(),
        safety=DummySafety(),
    )

    result = pipeline.run(
        user_query="show data",
        schema_preview="CREATE TABLE x(id int);",
    )

    assert isinstance(result, StageResult)
    assert result.ok is True
    assert result.data["ambiguous"] is True
    assert isinstance(result.data["questions"], list)
88
+
89
+
90
+ # --- 3) Planner failure -------------------------------------------------------
91
def test_pipeline_plan_fail():
    """Planner failure propagates: pipeline returns ok=False with its error."""
    pipeline = Pipeline(
        detector=DummyDetector(),
        planner=DummyPlanner(),
        generator=DummyGenerator(),
        safety=DummySafety(),
    )
    result = pipeline.run(
        user_query="fail_plan",
        schema_preview="CREATE TABLE singer(id int);",
    )
    assert isinstance(result, StageResult)
    assert result.ok is False
    assert "Planner failed" in " ".join(result.error or [])


# --- 4) Generator failure -----------------------------------------------------
def test_pipeline_gen_fail():
    """Generator failure propagates through the pipeline."""
    pipeline = Pipeline(
        detector=DummyDetector(),
        planner=DummyPlanner(),
        generator=DummyGenerator(),
        safety=DummySafety(),
    )
    result = pipeline.run(
        user_query="fail_gen",
        schema_preview="CREATE TABLE singer(id int);",
    )
    assert result.ok is False
    assert "Generator failed" in " ".join(result.error or [])


# --- 5) Safety failure --------------------------------------------------------
def test_pipeline_safety_fail():
    """Destructive SQL produced by the generator is vetoed by the safety stage."""

    class UnsafeGen(DummyGenerator):
        # Always emits a DROP so the safety stage must reject it.
        def run(self, **kw):
            trace = StageTrace(stage="generator", duration_ms=1.0)
            return StageResult(
                ok=True,
                data={"sql": "DROP TABLE x;", "rationale": "oops"},
                trace=trace,
            )

    pipeline = Pipeline(
        detector=DummyDetector(),
        planner=DummyPlanner(),
        generator=UnsafeGen(),
        safety=DummySafety(),
    )
    result = pipeline.run(
        user_query="drop something",
        schema_preview="CREATE TABLE x(id int);",
    )
    assert result.ok is False
    assert "unsafe" in " ".join(result.error or []).lower()
tests/test_safety.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nl2sql.safety import Safety
2
+ import pytest
3
+
4
+
5
+
6
def test_safety_allows_select():
    """A plain SELECT passes and the trace is attributed to the safety stage."""
    checker = Safety()
    result = checker.check("SELECT * FROM users;")
    assert result.ok
    assert "sql" in result.data
    assert result.trace.stage == "safety"


def test_safety_allows_with_select_cte():
    """WITH ... SELECT (read-only CTE) statements must pass."""
    checker = Safety()
    sql = """
    WITH recent AS (
        SELECT id FROM users WHERE created_at > '2024-01-01'
    )
    SELECT * FROM users u JOIN recent r ON u.id = r.id;
    """
    assert checker.check(sql).ok


def test_safety_allows_select_with_comments_and_newlines():
    """Leading/trailing comments around a SELECT do not trip the checker."""
    checker = Safety()
    sql = "/* head */ \n -- inline\n SELECT 1; -- tail"
    assert checker.check(sql).ok


def test_safety_allows_keywords_inside_string_literals():
    """Forbidden keywords inside string literals are data, not statements."""
    checker = Safety()
    result = checker.check("SELECT 'DROP TABLE x' as note, 'delete from y' as text;")
    assert result.ok, result.error
35
+
36
+
37
def test_safety_blocks_delete():
    """A bare DELETE must be rejected with a recognizable error message."""
    checker = Safety()
    result = checker.check("DELETE FROM users;")
    assert not result.ok
    assert any("Forbidden" in e or "Non-SELECT" in e for e in (result.error or []))


@pytest.mark.parametrize("sql", [
    "UPDATE users SET name='X' WHERE id=1;",
    "INSERT INTO users(id) VALUES (1);",
    "DROP TABLE users;",
    "CREATE TABLE x(id INT);",
    "ALTER TABLE users ADD COLUMN x INT;",
    "ATTACH DATABASE 'hack.db' AS h;",
    "PRAGMA journal_mode=WAL;",
])
def test_safety_blocks_forbidden_statements(sql):
    """Every DML/DDL/admin statement class is refused."""
    assert not Safety().check(sql).ok


def test_safety_blocks_stacked_delete_after_select():
    """Stacked statements: a DELETE after a SELECT is still refused."""
    assert not Safety().check("SELECT * FROM users; DELETE FROM users;").ok


def test_safety_blocks_stacked_delete_with_spaces():
    """Whitespace and newlines around the statement separator change nothing."""
    assert not Safety().check("SELECT * FROM users ; \n DELETE users;").ok


def test_safety_blocks_delete_inside_cte():
    """A writable CTE body (DELETE inside WITH) is refused."""
    checker = Safety()
    sql = """
    WITH bad AS (DELETE FROM users)
    SELECT * FROM users;
    """
    assert not checker.check(sql).ok


@pytest.mark.parametrize("sql", [
    "/*D*/ROP TABLE users;",
    "PR/*x*/AGMA journal_mode=WAL;",
    "AL/* comment */TER TABLE x ADD COLUMN y INT;",
])
def test_safety_blocks_comment_obfuscation(sql):
    """Keywords split by inline comments must still be detected."""
    assert not Safety().check(sql).ok


@pytest.mark.parametrize("sql", [
    "pragma journal_mode=WAL;",  # lower-case
    " PRAGMA user_version = 5 ; ",
    "\nATTACH DATABASE 'hack.db' AS h;",
])
def test_safety_blocks_forbidden_case_and_spacing(sql):
    """Matching is case-insensitive and tolerant of surrounding whitespace."""
    assert not Safety().check(sql).ok


def test_safety_blocks_multiple_nonempty_statements_even_if_second_is_comment():
    """A trailing comment after ';' is fine; a second real statement is not."""
    checker = Safety()
    sql = "SELECT 1; -- now do something bad\n"
    sql_bad = "SELECT 1; /* spacer */ DROP TABLE x;"
    assert checker.check(sql).ok
    assert not checker.check(sql_bad).ok
tests/test_stage_types.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nl2sql.types import StageResult, StageTrace
2
+
3
def test_error_response():
    """A failed StageResult keeps its error list verbatim."""
    result = StageResult(ok=False, error=["Syntax error"])
    assert not result.ok
    assert result.error == ["Syntax error"]


def test_trace_dataclass_structure():
    """StageTrace stores stage name, float duration and token counts."""
    trace = StageTrace(stage="planner", duration_ms=12.5, token_in=10, token_out=20)
    assert trace.stage == "planner"
    assert isinstance(trace.duration_ms, float)
    assert trace.token_out == 20


def test_stage_result_defaults():
    """`data` and `error` default to None on a bare success result."""
    result = StageResult(ok=True)
    assert result.ok
    assert result.data is None
    assert result.error is None
ui/benchmark_app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import pandas as pd
import streamlit as st
import plotly.express as px
from pathlib import Path

st.set_page_config(page_title="NL2SQL Benchmark Dashboard", layout="wide")

st.title("📊 NL2SQL Copilot – Benchmark Dashboard")

# 1. Load results: each *.jsonl file holds one JSON object per line.
result_files = list(Path("benchmarks/results").glob("*.jsonl"))
if not result_files:
    st.warning("No benchmark result files found in benchmarks/results/")
    st.stop()

selected_file = st.selectbox("Select benchmark file", result_files)
# Use a context manager so the handle is closed deterministically (the original
# `open(file)` inside a comprehension was never closed), and skip blank lines
# so a trailing newline in the .jsonl doesn't crash json.loads.
with open(selected_file, encoding="utf-8") as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
df = pd.DataFrame(rows)

# 2. Summary metrics
st.subheader("Aggregate Metrics")
col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Queries", len(df))
col2.metric("Execution Accuracy", f"{df['exec_acc'].mean()*100:.1f}%")
col3.metric("Safety Violations", f"{df['safe_fail'].mean()*100:.1f}%")
col4.metric("Average Latency (ms)", f"{df['latency_ms'].mean():.0f}")

# 3. Latency Distribution
st.subheader("Latency Distribution")
fig1 = px.histogram(df, x="latency_ms", nbins=30, title="Latency Histogram")
st.plotly_chart(fig1, use_container_width=True)

# 4. Cost vs Accuracy
st.subheader("Cost vs Execution Accuracy")
fig2 = px.scatter(df, x="cost_usd", y="exec_acc", color="provider",
                  title="Trade-off: Cost vs Accuracy", hover_data=["query"])
st.plotly_chart(fig2, use_container_width=True)

# 5. Repair Stats (only present when the benchmark recorded repair loops)
if "repair_attempts" in df.columns:
    st.subheader("Repair Attempts")
    repair_counts = df.groupby("repair_attempts").size().reset_index(name="count")
    fig3 = px.bar(repair_counts, x="repair_attempts", y="count",
                  title="Number of Repair Attempts per Query")
    st.plotly_chart(fig3, use_container_width=True)