Spaces:

liovina
/

nl-sql

Sleeping

File size: 3,949 Bytes

"""One-shot audit: re-score every stored record under the fixed runner.

Reads a baseline/voting eval JSON, re-executes each `pred_sql` + `gold_sql`
through `_execute_gold` + `execute_readonly`, recomputes `match` via
`safe_compare_pred`, and reports every qid where the stored flag disagrees
with the fresh computation.

Use this to validate that the SQLAlchemy `:identifier` bind-bug fix
(see commit 8aa7544) did not leave residual false positives or false
negatives anywhere in the n=200 evaluation surface.

Example:
    uv run python scripts/audit_rescore.py \
        --report eval/reports/2026-05-18/v16-helallao-dac-reasoning.json
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from nl_sql.db import DatabaseSpec
from nl_sql.db.connection import execute_readonly, sqlite_url_readonly
from nl_sql.eval.metrics.execution_accuracy import safe_compare_pred
from nl_sql.eval.runner import _execute_gold


def _format_mismatch(m: dict[str, object]) -> str:
    """Render one mismatch entry as the indented line printed in the summary.

    ``difficulty`` is converted with ``!s`` before the width spec is applied:
    it is read with ``r.get("difficulty")`` and is therefore ``None`` for
    records that lack the key, and ``None`` rejects the ``11s`` format spec
    with ``TypeError``.
    """
    return (
        f"    qid={m['qid']:>5} {m['difficulty']!s:11s} "
        f"stored={m['stored_match']} → true={m['true_match']} "
        f"(gold={m['gold_rows']}, pred={m['pred_rows']}) "
        f"reason={m['reason']!r}"
    )


def main() -> int:
    """Re-score every record of a stored eval report and print the disagreements.

    Command-line arguments:
        --report     Path to the eval JSON. Either a dict with a ``"records"``
                     list or a bare list of records.
        --data-root  Directory holding ``<db_id>/<db_id>.sqlite`` databases.

    For each record the gold SQL and (when non-empty) the predicted SQL are
    re-executed, the match flag is recomputed with ``safe_compare_pred``, and
    the record is reported when the recomputed flag differs from the stored
    ``match`` value.

    Returns:
        0 when no record disagrees with its stored flag, 1 otherwise.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--report", type=Path, required=True)
    p.add_argument(
        "--data-root",
        type=Path,
        default=Path("data/bird_mini_dev/MINIDEV/dev_databases"),
    )
    args = p.parse_args()

    data = json.loads(args.report.read_text(encoding="utf-8"))
    records = data["records"] if isinstance(data, dict) else data

    # Same limits for the gold and the predicted query.
    timeout_ms = 30_000
    row_cap = 10_000

    mismatches: list[dict[str, object]] = []
    matched_true = 0
    for r in records:
        db_id = r.get("db_id")
        db_path = args.data_root / db_id / f"{db_id}.sqlite"
        spec = DatabaseSpec(id=db_id, dialect="sqlite", url=sqlite_url_readonly(db_path))
        engine = spec.make_engine()
        try:
            gold_rows, _ = _execute_gold(
                engine, r["gold_sql"], statement_timeout_ms=timeout_ms, row_cap=row_cap
            )
            pred_sql = r.get("pred_sql") or ""
            pred_rows: list = []
            pred_failed = False
            if pred_sql.strip():
                try:
                    with execute_readonly(
                        engine, pred_sql, statement_timeout_ms=timeout_ms, row_cap=row_cap
                    ) as result:
                        pred_rows = list(result.rows)
                except Exception:
                    # A failing prediction is a scoring outcome, not an audit
                    # error: drop any rows and let the comparator see the flag.
                    pred_rows = []
                    pred_failed = True
                cmp = safe_compare_pred(
                    gold_rows, pred_rows, gold_sql=r["gold_sql"], pred_failed=pred_failed
                )
                true_match = bool(cmp.match)
                reason = cmp.reason
            else:
                # Missing or whitespace-only prediction never counts as a match.
                true_match = False
                reason = "empty prediction"
            if true_match:
                matched_true += 1
            stored = bool(r.get("match"))
            if stored != true_match:
                mismatches.append(
                    {
                        "qid": r["question_id"],
                        "difficulty": r.get("difficulty"),
                        "db_id": db_id,
                        "stored_match": stored,
                        "true_match": true_match,
                        "gold_rows": len(gold_rows),
                        "pred_rows": len(pred_rows),
                        "reason": reason,
                    }
                )
        finally:
            # One engine per record; always release it, even when re-execution raises.
            engine.dispose()

    matched_stored = sum(1 for r in records if r.get("match"))
    print(f"Report: {args.report}")
    print(f"  records: {len(records)}")
    print(f"  matches stored: {matched_stored}")
    print(f"  matches true:   {matched_true}")
    print(f"  mismatches:     {len(mismatches)}")
    for m in mismatches:
        print(_format_mismatch(m))
    return 0 if not mismatches else 1


if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)