Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 11 days ago

1.65 kB

	"""Run a vanilla Ollama Cloud model through the eval harness (small batch).

	Completes the eval matrix with a real LLM row alongside no-op/heuristic/oracle, so we
	can see whether a fine-tune is needed and how big the gap is.

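	Run as a module from the repo root (the relative imports below fail when the
	file is executed directly as a script):

	uv run python -m eval.run_model --n 12 --model glm-5.1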
	"""

	from __future__ import annotations

	import argparse

	from scrubdata.model_planner import make_ollama_planner
	from scrubdata.planner import mock_plan

	from .gold import load_gold
	from .run_eval import evaluate


	def main() -> None:
	    """Run the oracle, heuristic and vanilla-LLM planners and print a score table.

	    Command-line options:
	        --n      number of gold examples to keep (must be >= 1, default 12)
	        --seed   value echoed in the report header (default 4242)
	        --model  model name handed to ``make_ollama_planner`` (default "glm-5.1")

	    Each system is passed to ``evaluate`` together with the first ``n`` gold
	    examples, and one row of metrics per system is printed to stdout.

	    Exits with status 2 (via ``argparse``) when ``--n`` is not positive.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--n", type=int, default=12)
	    ap.add_argument("--seed", type=int, default=4242)
	    ap.add_argument("--model", type=str, default="glm-5.1")
	    args = ap.parse_args()

	    # A negative slice bound would silently drop examples from the END of the
	    # gold set, and zero would evaluate nothing; reject both up front.
	    if args.n < 1:
	        ap.error("--n must be a positive integer")

	    gold = load_gold()[:args.n]

	    # Every system is called as fn(df, gp): the oracle returns its second
	    # argument unchanged, the heuristic plans from the first argument only.
	    systems = {
	        "ORACLE (gold)": lambda df, gp: gp,
	        "HEURISTIC": lambda df, gp: mock_plan(df),
	        f"VANILLA {args.model}": make_ollama_planner(args.model),
	    }
	    rows = {name: evaluate(fn, gold) for name, fn in systems.items()}

	    cols = ["json_valid", "op_f1", "canon_f1", "canon_r", "recovery"]
	    # Keep the historical 22-char name column, but widen it when a long model
	    # name would otherwise push the metric columns out of alignment.
	    name_w = max([22, *(len(name) for name in systems)])

	    # Report how many examples were actually evaluated: the slice above yields
	    # fewer than --n when the gold set is smaller.
	    # NOTE(review): --seed is only echoed here; nothing in this function uses it
	    # to select examples — confirm whether load_gold() should receive it.
	    print(f"\nModel eval on {len(gold)} held-out examples (seed {args.seed})\n")
	    print(f"{'system':<{name_w}}" + "".join(f"{c:>11}" for c in cols))
	    print("-" * (name_w + 11 * len(cols)))
	    for name, m in rows.items():
	        print(f"{name:<{name_w}}" + "".join(f"{m[c]:>11.3f}" for c in cols))
	    print("\nNote: a vanilla model scores low canon_f1/recovery mostly from CONVENTION "
	          "mismatch (its canonical forms/ops differ from our executor's) — which is "
	          "exactly what fine-tuning aligns.")


	# Script entry point. This module uses package-relative imports (`.gold`,
	# `.run_eval`), so it presumably has to be launched as a module
	# (`python -m <package>.run_model`) for them to resolve — TODO confirm.
	if __name__ == "__main__":
	    main()