scrubdata / eval /capture_plan_local.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
3.74 kB
"""Capture a raw v6 model plan LOCALLY (Ollama Q8_0 GGUF) for a Raha dataset.
Mirrors the Modal capture composition (scripts/modal_eval_v5.py --capture):
make_batched_planner(base, batch_size=4), greedy, no grounded wrapper, no union β€”
verification/union happen downstream (eval/raha_table.py, eval/precision_curve.py).
DISCLOSED deltas vs the Modal captures: (1) Q8_0 GGUF on local Ollama instead of the
bf16 merged adapter on A100 β€” quantization may shift individual mappings; (2) Ollama
format=json instead of generate(suppress_tokens=[151657,151658]) β€” both exist solely
to block the degenerate <tool_call> first token (without either, generation loops).
Prereq: ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
ollama create scrubdata-ft -f notebooks/Modelfile
uv run python -m eval.capture_plan_local --dataset beers
Writes eval/results/v6_<dataset>_raw_plan_localq8.json.
"""
from __future__ import annotations
import argparse
import json
import time
from pathlib import Path
from scrubdata.model_planner import _extract_json, make_batched_planner
from .run_real_multi import _raha_pair
def make_json_constrained_planner(model: str, host: str = "http://localhost:11434",
timeout: int = 600):
"""Local Ollama planner with format=json (grammar-constrained decoding)."""
import urllib.request
from scrubdata.profiler import profile_dataframe
from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
def planner(dirty_df, *_):
user = build_user_prompt(profile_dataframe(dirty_df), dirty_df)
payload = {
"model": model, "stream": False, "format": "json",
"messages": [{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user}],
"options": {"temperature": 0, "num_predict": 2000, "num_ctx": 16384},
}
req = urllib.request.Request(
host + "/api/chat", data=json.dumps(payload).encode(),
headers={"Content-Type": "application/json"})
try:
with urllib.request.urlopen(req, timeout=timeout) as r:
out = json.loads(r.read())["message"]["content"]
except Exception as e: # noqa: BLE001
print(f" batch failed: {str(e)[:80]}", flush=True)
return {"__error__": str(e)[:120]}
plan = _extract_json(out)
if plan is None:
print(f" batch returned no JSON: {out[:80]!r}", flush=True)
return {"__error__": "no_json"}
plan.setdefault("table_operations", [])
plan.setdefault("columns", [])
plan.setdefault("flags", [])
return plan
return planner
def main() -> None:
ap = argparse.ArgumentParser()
ap.add_argument("--dataset", required=True)
ap.add_argument("--model", default="scrubdata-ft")
ap.add_argument("--timeout", type=int, default=600)
args = ap.parse_args()
dirty, _clean = _raha_pair(args.dataset) # same table the scorer sees
print(f"capturing plan: {args.dataset} ({len(dirty)} rows x {dirty.shape[1]} cols)",
flush=True)
t0 = time.time()
plan = make_batched_planner(make_json_constrained_planner(args.model, timeout=args.timeout),
batch_size=4)(dirty)
dt = time.time() - t0
n_ops = sum(len(c.get("operations", [])) for c in plan.get("columns", []))
print(f"done in {dt:.0f}s β€” {len(plan.get('columns', []))} columns, {n_ops} ops")
out = (Path(__file__).resolve().parent / "results"
/ f"v6_{args.dataset}_raw_plan_localq8.json")
json.dump(plan, open(out, "w"), indent=1)
print(f"written to {out}")
if __name__ == "__main__":
main()