scrubdata / eval /run_model.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
1.65 kB
"""Run a vanilla Ollama Cloud model through the eval harness (small batch).
Completes the eval matrix with a real LLM row alongside no-op/heuristic/oracle, so we
can see whether a fine-tune is needed and how big the gap is.
uv run python -m eval.run_model --n 12 --model glm-5.1   (from the repo root; running the file path directly breaks the relative imports)
"""
from __future__ import annotations
import argparse
from scrubdata.model_planner import make_ollama_planner
from scrubdata.planner import mock_plan
from .gold import load_gold
from .run_eval import evaluate
def main() -> None:
    """Run the oracle, heuristic and vanilla-model planners and print a metric table.

    Command-line options:
        --n      Number of gold examples to evaluate (taken from the front of
                 the list returned by ``load_gold``). Must be >= 1.
        --seed   Value echoed in the report header. This script does not pass
                 it to anything else.
        --model  Model name handed to ``make_ollama_planner``.

    Exits with status 2 (via ``argparse``) when ``--n`` is not positive.
    """
    ap = argparse.ArgumentParser(
        description="Run a vanilla Ollama model through the eval harness.",
    )
    ap.add_argument("--n", type=int, default=12,
                    help="number of gold examples to evaluate (>= 1)")
    ap.add_argument("--seed", type=int, default=4242,
                    help="seed label shown in the report header")
    ap.add_argument("--model", type=str, default="glm-5.1",
                    help="model name passed to the Ollama planner")
    args = ap.parse_args()

    # A negative value would silently drop examples from the END of the gold
    # set ([:-k]) and zero would evaluate nothing, so reject both up front.
    if args.n < 1:
        ap.error("--n must be a positive integer")

    gold = load_gold()[:args.n]

    # Each system is called as fn(df, gp); the oracle simply returns its
    # second argument (presumably the gold plan -- confirm against evaluate).
    systems = {
        "ORACLE (gold)": lambda df, gp: gp,
        "HEURISTIC": lambda df, gp: mock_plan(df),
        f"VANILLA {args.model}": make_ollama_planner(args.model),
    }
    rows = {name: evaluate(fn, gold) for name, fn in systems.items()}

    cols = ["json_valid", "op_f1", "canon_f1", "canon_r", "recovery"]
    # Keep the historical 22-character name column, but widen it when a long
    # model name would otherwise push the metric columns out of alignment.
    name_w = max(22, *(len(name) for name in rows))

    # Report the number of examples actually evaluated: the gold set may hold
    # fewer than the requested --n.
    print(f"\nModel eval on {len(gold)} held-out examples (seed {args.seed})\n")
    print(f"{'system':<{name_w}}" + "".join(f"{c:>11}" for c in cols))
    print("-" * (name_w + 11 * len(cols)))
    for name, m in rows.items():
        print(f"{name:<{name_w}}" + "".join(f"{m[c]:>11.3f}" for c in cols))
    print("\nNote: a vanilla model scores low canon_f1/recovery mostly from CONVENTION "
          "mismatch (its canonical forms/ops differ from our executor's) — which is "
          "exactly what fine-tuning aligns.")


if __name__ == "__main__":
    main()