Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /minicpm_check.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 19 days ago

3.51 kB

	"""OpenBMB MiniCPM gate check — drops a zero-shot MiniCPM planner into the IDENTICAL
	hospital verify(tau=0.5)+union gate the scaling arm uses, and scores it against the
	shipped Qwen3-4B fine-tune bar (union 0.905@0.413).

	Reuses eval/scaling_arm.py wholesale (make_cloud_planner hits /api/chat, so a LOCAL
	ollama model id works through the same contract — no cloud proxy involved here). Does
	not modify scaling_arm's existing rows; writes eval/results/minicpm_check.json.

	uv run python -m eval.minicpm_check
	"""

	from __future__ import annotations

	import json
	import time
	from pathlib import Path

	from scrubdata.model_planner import make_batched_planner

	from .run_real import _ensure_data, _load
	from .scaling_arm import (PRIOR_4B_ROW, RESULTS, make_cloud_planner, score_point)

	# Run configuration for the MiniCPM gate check.

	# Verifier threshold handed to score_point(); the module docstring names the
	# same tau=0.5 gate.
	TAU: float = 0.5

	# Model id passed to make_cloud_planner() and recorded in the result row.
	MODEL: str = "hf.co/openbmb/MiniCPM3-4B-GGUF:Q4_K_M"

	# Size in billions of parameters; stored as "params_b" and shown in the banner.
	PARAMS_B: int = 4

	# Free-text family label; stored as "family" and shown in the banner.
	FAMILY: str = "openbmb/minicpm3 (zero-shot, local)"


	def main() -> None:
	    """Run the MiniCPM planner through the hospital gate and write the results.

	    Steps, in order:

	    1. Ensure the dataset is present and load the dirty/clean pair.
	    2. Build a planner for ``MODEL`` with ``make_cloud_planner`` and wrap it
	       with ``make_batched_planner`` (batch size 4).
	    3. Run the batched planner on the dirty data, timing the call. Any
	       exception is caught and recorded as ``{"__error__": <message>}`` so a
	       result row is still written.
	    4. Dump the raw plan to
	       ``RESULTS / "scaling_minicpm3_4b_hospital_raw_plan.json"``.
	    5. Score the plan with ``score_point(..., tau=TAU)`` when one was
	       produced; otherwise fill the metric fields with ``None``.
	    6. Write ``RESULTS / "minicpm_check.json"`` containing the reference row
	       (``PRIOR_4B_ROW``) followed by this run's row.

	    Returns:
	        None. Results are reported on stdout and written to disk.
	    """
	    _ensure_data()
	    dirty, clean = _load()

	    print(f"=== {MODEL} ({FAMILY}, {PARAMS_B}B) — hospital, 509 real errors ===",
	          flush=True)
	    # local model, served by the same daemon at localhost:11434 via /api/chat;
	    # bigger num_predict so a chatty 4B isn't truncated mid-JSON
	    num_predict = 4000
	    raw, stats = make_cloud_planner(MODEL, num_predict=num_predict)
	    batched = make_batched_planner(raw, batch_size=4)
	    t0 = time.time()
	    try:
	        raw_plan = batched(dirty)
	    except Exception as e:
	        # Deliberate best effort: a failed planner run must still yield a row,
	        # so keep a truncated message instead of propagating.
	        raw_plan = {"__error__": str(e)[:200]}
	    runtime = round(time.time() - t0, 1)
	    validity = round(stats["valid"] / stats["calls"], 3) if stats["calls"] else 0.0

	    # Create the output directory up front so a fresh checkout does not lose
	    # the (slow) planner run to a missing-directory error.
	    RESULTS.mkdir(parents=True, exist_ok=True)

	    plan_path = RESULTS / "scaling_minicpm3_4b_hospital_raw_plan.json"
	    # Context manager guarantees the file is flushed and closed.
	    with open(plan_path, "w", encoding="utf-8") as plan_file:
	        json.dump(raw_plan, plan_file)

	    row = {"model": MODEL, "params_b": PARAMS_B, "family": FAMILY,
	           "validity": validity, "runtime_s": runtime,
	           "n_calls": stats["calls"], "errors": stats["errors"][:5],
	           # Built from the values actually used so it cannot drift from them.
	           "provenance": "this run (ZERO-SHOT, local RTX 3060 Ti via ollama "
	                         f"{MODEL}, num_predict={num_predict})",
	           "raw_plan": str(plan_path.relative_to(RESULTS.parent.parent))}
	    if isinstance(raw_plan, dict) and "__error__" not in raw_plan:
	        row.update(score_point(dirty, clean, raw_plan, tau=TAU))
	    else:
	        # No usable plan: keep the row shape stable with explicit nulls.
	        row.update({"gated_prec": None, "gated_cov": None,
	                    "union_prec": None, "union_cov": None,
	                    "verifier_kept": None, "verifier_dropped": None,
	                    "note": "planner produced no plan"})

	    print(f" validity {validity} runtime {runtime}s "
	          f"gated {row.get('gated_prec')}/{row.get('gated_cov')} "
	          f"union {row.get('union_prec')}/{row.get('union_cov')} "
	          f"kept/dropped {row.get('verifier_kept')}/{row.get('verifier_dropped')}",
	          flush=True)

	    bar = PRIOR_4B_ROW
	    out = {
	        "task": "OpenBMB MiniCPM gate check — hospital 509 real errors, tau=0.5, "
	                "v6 gate protocol (batched raw plan -> verify -> union). "
	                "ZERO-SHOT MiniCPM vs FINE-TUNED Qwen3-4B bar.",
	        "bar": {"model": bar["model"], "union_prec": bar["union_prec"],
	                "union_cov": bar["union_cov"], "gated_prec": bar["gated_prec"],
	                "gated_cov": bar["gated_cov"]},
	        # Copy the reference row so the imported constant is never aliased.
	        "rows": [dict(bar), row],
	    }
	    out_path = RESULTS / "minicpm_check.json"
	    with open(out_path, "w", encoding="utf-8") as out_file:
	        json.dump(out, out_file, indent=1)
	    print("\nwritten:", out_path)


	# Script entry point: run the check only when this module is executed
	# directly (e.g. `python -m eval.minicpm_check`), never on import.
	if __name__ == "__main__":
	    main()