Spaces:
Running
Running
| """Run a vanilla Ollama Cloud model through the eval harness (small batch). | |
| Completes the eval matrix with a real LLM row alongside no-op/heuristic/oracle, so we | |
| can see whether a fine-tune is needed and how big the gap is. | |
| uv run python -m eval.run_model --n 12 --model glm-5.1 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| from scrubdata.model_planner import make_ollama_planner | |
| from scrubdata.planner import mock_plan | |
| from .gold import load_gold | |
| from .run_eval import evaluate | |
def main() -> None:
    """Evaluate the oracle, heuristic and vanilla-LLM planners and print a table.

    Parses ``--n``, ``--seed`` and ``--model`` from the command line, keeps the
    first ``--n`` items returned by ``load_gold()``, calls ``evaluate`` once per
    system, and prints one fixed-width row of metrics per system followed by an
    interpretation note.

    Exits through ``argparse`` with a usage error when ``--n`` is less than 1.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=12)
    ap.add_argument("--seed", type=int, default=4242)
    ap.add_argument("--model", type=str, default="glm-5.1")
    args = ap.parse_args()
    if args.n < 1:
        # A negative value would silently slice from the END of the gold list
        # and 0 would evaluate nothing, so reject both up front.
        ap.error("--n must be a positive integer")

    # NOTE(review): --seed is only echoed in the header below; nothing in this
    # function passes it to load_gold() or evaluate(). Confirm whether it is
    # meant to drive example selection.
    gold = load_gold()[: args.n]

    # Each system is a callable taking (df, gold_plan); the oracle simply
    # returns the gold plan it is given.
    systems = {
        "ORACLE (gold)": lambda df, gp: gp,
        "HEURISTIC": lambda df, gp: mock_plan(df),
        f"VANILLA {args.model}": make_ollama_planner(args.model),
    }
    rows = {name: evaluate(fn, gold) for name, fn in systems.items()}

    cols = ["json_valid", "op_f1", "canon_f1", "canon_r", "recovery"]
    # Keep the historical 22-character name column, but widen it when a long
    # model name would otherwise push the metric columns out of alignment.
    name_w = max(22, max(len(name) for name in rows) + 1)

    # Report the number of examples actually evaluated, which can be smaller
    # than the requested --n when the gold set is short.
    print(f"\nModel eval on {len(gold)} held-out examples (seed {args.seed})\n")
    print(f"{'system':<{name_w}}" + "".join(f"{c:>11}" for c in cols))
    print("-" * (name_w + 11 * len(cols)))
    for name, m in rows.items():
        print(f"{name:<{name_w}}" + "".join(f"{m[c]:>11.3f}" for c in cols))
    print("\nNote: a vanilla model scores low canon_f1/recovery mostly from CONVENTION "
          "mismatch (its canonical forms/ops differ from our executor's) — which is "
          "exactly what fine-tuning aligns.")
# Script entry point: run the evaluation only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()