scrubdata / eval /minicpm_check.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
3.51 kB
"""OpenBMB MiniCPM gate check β€” drops a zero-shot MiniCPM planner into the IDENTICAL
hospital verify(tau=0.5)+union gate the scaling arm uses, and scores it against the
shipped Qwen3-4B fine-tune bar (union 0.905@0.413).
Reuses eval/scaling_arm.py wholesale (make_cloud_planner hits /api/chat, so a LOCAL
ollama model id works through the same contract — no cloud proxy involved here). Does
not modify scaling_arm's existing rows; writes eval/results/minicpm_check.json.
uv run python -m eval.minicpm_check
"""
from __future__ import annotations
import json
import time
from pathlib import Path
from scrubdata.model_planner import make_batched_planner
from .run_real import _ensure_data, _load
from .scaling_arm import (PRIOR_4B_ROW, RESULTS, make_cloud_planner, score_point)
# Model id handed to make_cloud_planner (the module docstring describes it as a
# local ollama model id); also recorded as "model" in the result row.
MODEL = "hf.co/openbmb/MiniCPM3-4B-GGUF:Q4_K_M"
# Recorded as "params_b" in the result row and shown in the banner as "<n>B".
PARAMS_B = 4
# Recorded as "family" in the result row and shown in the banner.
FAMILY = "openbmb/minicpm3 (zero-shot, local)"
# Passed as tau= to score_point; the module docstring names the gate verify(tau=0.5).
TAU = 0.5
def main() -> None:
    """Run the MiniCPM planner over the loaded data, score it, and write the results.

    Steps, in order:
      1. Ensure the data is present and load the (dirty, clean) pair.
      2. Build a planner for MODEL via make_cloud_planner, wrap it with
         make_batched_planner (batch_size=4) and run it on the dirty data,
         timing the call.
      3. Dump the raw plan (or an ``{"__error__": ...}`` stub if the planner
         raised) to ``RESULTS/scaling_minicpm3_4b_hospital_raw_plan.json``.
      4. Build a result row; when a usable plan exists, merge in the metrics
         returned by score_point(..., tau=TAU), otherwise fill the metric
         fields with None and attach a note.
      5. Write ``RESULTS/minicpm_check.json`` holding the PRIOR_4B_ROW bar and
         both rows.

    Planner failures are deliberately swallowed (best effort): the error text
    is truncated to 200 characters and stored in place of the plan so the run
    still produces an output file.
    """
    _ensure_data()
    dirty, clean = _load()
    # NOTE(review): the "β€”" below looks like a mis-decoded em dash; it is
    # runtime output, so it is left untouched here -- confirm against the repo.
    print(f"=== {MODEL} ({FAMILY}, {PARAMS_B}B) β€” hospital, 509 real errors ===",
          flush=True)

    # local model, served by the same daemon at localhost:11434 via /api/chat;
    # bigger num_predict so a chatty 4B isn't truncated mid-JSON
    raw, stats = make_cloud_planner(MODEL, num_predict=4000)
    batched = make_batched_planner(raw, batch_size=4)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during a long model run.
    t0 = time.perf_counter()
    try:
        raw_plan = batched(dirty)
    except Exception as e:
        # Best effort: keep going and record a short form of the failure.
        raw_plan = {"__error__": str(e)[:200]}
    runtime = round(time.perf_counter() - t0, 1)
    # Guard the division: zero calls means no validity can be computed.
    validity = round(stats["valid"] / stats["calls"], 3) if stats["calls"] else 0.0

    # Make sure the output directory exists so a fresh checkout does not fail
    # only after the (slow) planner run has finished.
    RESULTS.mkdir(parents=True, exist_ok=True)
    plan_path = RESULTS / "scaling_minicpm3_4b_hospital_raw_plan.json"
    # Context manager guarantees the file is flushed and closed.
    with open(plan_path, "w", encoding="utf-8") as plan_file:
        json.dump(raw_plan, plan_file)

    row = {"model": MODEL, "params_b": PARAMS_B, "family": FAMILY,
           "validity": validity, "runtime_s": runtime,
           "n_calls": stats["calls"], "errors": stats["errors"][:5],
           "provenance": "this run (ZERO-SHOT, local RTX 3060 Ti via ollama "
                         "hf.co/openbmb/MiniCPM3-4B-GGUF:Q4_K_M, num_predict=4000)",
           "raw_plan": str(plan_path.relative_to(RESULTS.parent.parent))}
    if isinstance(raw_plan, dict) and "__error__" not in raw_plan:
        row.update(score_point(dirty, clean, raw_plan, tau=TAU))
    else:
        # No usable plan: keep the metric keys present but empty.
        row.update({"gated_prec": None, "gated_cov": None,
                    "union_prec": None, "union_cov": None,
                    "verifier_kept": None, "verifier_dropped": None,
                    "note": "planner produced no plan"})
    print(f" validity {validity} runtime {runtime}s "
          f"gated {row.get('gated_prec')}/{row.get('gated_cov')} "
          f"union {row.get('union_prec')}/{row.get('union_cov')} "
          f"kept/dropped {row.get('verifier_kept')}/{row.get('verifier_dropped')}",
          flush=True)

    bar = PRIOR_4B_ROW
    out = {
        "task": "OpenBMB MiniCPM gate check β€” hospital 509 real errors, tau=0.5, "
                "v6 gate protocol (batched raw plan -> verify -> union). "
                "ZERO-SHOT MiniCPM vs FINE-TUNED Qwen3-4B bar.",
        "bar": {"model": bar["model"], "union_prec": bar["union_prec"],
                "union_cov": bar["union_cov"], "gated_prec": bar["gated_prec"],
                "gated_cov": bar["gated_cov"]},
        # dict(bar) copies the shared bar row so the output owns its own dict.
        "rows": [dict(bar), row],
    }
    out_path = RESULTS / "minicpm_check.json"
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(out, out_file, indent=1)
    print("\nwritten:", out_path)
# Script entry point: run the gate check only when this module is executed
# directly (e.g. `python -m eval.minicpm_check`), not when it is imported.
if __name__ == "__main__":
    main()