Spaces:
Running
Running
"""OpenBMB MiniCPM gate check — drops a zero-shot MiniCPM planner into the IDENTICAL
hospital verify(tau=0.5)+union gate the scaling arm uses, and scores it against the
shipped Qwen3-4B fine-tune bar (union 0.905@0.413).

Reuses eval/scaling_arm.py wholesale (make_cloud_planner hits /api/chat, so a LOCAL
ollama model id works through the same contract — no cloud proxy involved here). Does
not modify scaling_arm's existing rows; writes eval/results/minicpm_check.json.

    uv run python -m eval.minicpm_check
"""
| from __future__ import annotations | |
| import json | |
| import time | |
| from pathlib import Path | |
| from scrubdata.model_planner import make_batched_planner | |
| from .run_real import _ensure_data, _load | |
| from .scaling_arm import (PRIOR_4B_ROW, RESULTS, make_cloud_planner, score_point) | |
# Model id handed to make_cloud_planner() and recorded in the result row.
MODEL: str = "hf.co/openbmb/MiniCPM3-4B-GGUF:Q4_K_M"

# Reported as "params_b" in the result row and shown in the run banner.
PARAMS_B: int = 4

# Free-text label reported as "family" in the result row.
FAMILY: str = "openbmb/minicpm3 (zero-shot, local)"

# Threshold forwarded to score_point(..., tau=TAU).
TAU: float = 0.5
def main() -> None:
    """Run the MiniCPM planner through the hospital gate and record the result.

    Steps, in order:

    1. Call ``_ensure_data()`` and load the ``(dirty, clean)`` pair via ``_load()``.
    2. Build a planner with ``make_cloud_planner(MODEL, ...)``, wrap it with
       ``make_batched_planner(..., batch_size=4)`` and run it on ``dirty``,
       timing the call.
    3. Dump the raw plan (or an ``{"__error__": ...}`` stub when the planner
       raised) to ``RESULTS / "scaling_minicpm3_4b_hospital_raw_plan.json"``.
    4. Score the plan with ``score_point(..., tau=TAU)``; when there is no
       usable plan, every metric is recorded as ``None`` with an explanatory note.
    5. Write ``RESULTS / "minicpm_check.json"`` containing the ``PRIOR_4B_ROW``
       bar next to this run's row.

    Returns:
        None. Results are reported through stdout and the two JSON files.
    """
    _ensure_data()
    dirty, clean = _load()
    print(f"=== {MODEL} ({FAMILY}, {PARAMS_B}B) — hospital, 509 real errors ===",
          flush=True)

    # local model, served by the same daemon at localhost:11434 via /api/chat;
    # bigger num_predict so a chatty 4B isn't truncated mid-JSON
    num_predict = 4000
    raw, stats = make_cloud_planner(MODEL, num_predict=num_predict)
    batched = make_batched_planner(raw, batch_size=4)

    # perf_counter is monotonic, so the measured runtime cannot be skewed by
    # wall-clock adjustments during a long planner run.
    started = time.perf_counter()
    try:
        raw_plan = batched(dirty)
    except Exception as e:
        # Deliberate best-effort: a planner failure is recorded as a row with
        # no metrics instead of aborting the check.
        raw_plan = {"__error__": str(e)[:200]}
    runtime = round(time.perf_counter() - started, 1)

    validity = round(stats["valid"] / stats["calls"], 3) if stats["calls"] else 0.0

    # RESULTS is expected to exist already; creating it is then a no-op.
    RESULTS.mkdir(parents=True, exist_ok=True)
    plan_path = RESULTS / "scaling_minicpm3_4b_hospital_raw_plan.json"
    # Context manager so the handle is flushed and closed deterministically.
    with open(plan_path, "w", encoding="utf-8") as fh:
        json.dump(raw_plan, fh)

    row = {
        "model": MODEL,
        "params_b": PARAMS_B,
        "family": FAMILY,
        "validity": validity,
        "runtime_s": runtime,
        "n_calls": stats["calls"],
        "errors": stats["errors"][:5],
        # Built from the values actually used so the record cannot drift
        # from the run configuration.
        "provenance": f"this run (ZERO-SHOT, local RTX 3060 Ti via ollama "
                      f"{MODEL}, num_predict={num_predict})",
        "raw_plan": str(plan_path.relative_to(RESULTS.parent.parent)),
    }

    if isinstance(raw_plan, dict) and "__error__" not in raw_plan:
        row.update(score_point(dirty, clean, raw_plan, tau=TAU))
    else:
        row.update({"gated_prec": None, "gated_cov": None,
                    "union_prec": None, "union_cov": None,
                    "verifier_kept": None, "verifier_dropped": None,
                    "note": "planner produced no plan"})

    print(f" validity {validity} runtime {runtime}s "
          f"gated {row.get('gated_prec')}/{row.get('gated_cov')} "
          f"union {row.get('union_prec')}/{row.get('union_cov')} "
          f"kept/dropped {row.get('verifier_kept')}/{row.get('verifier_dropped')}",
          flush=True)

    bar = PRIOR_4B_ROW
    out = {
        "task": "OpenBMB MiniCPM gate check — hospital 509 real errors, tau=0.5, "
                "v6 gate protocol (batched raw plan -> verify -> union). "
                "ZERO-SHOT MiniCPM vs FINE-TUNED Qwen3-4B bar.",
        "bar": {"model": bar["model"], "union_prec": bar["union_prec"],
                "union_cov": bar["union_cov"], "gated_prec": bar["gated_prec"],
                "gated_cov": bar["gated_cov"]},
        # dict(bar) copies the shared row so the written structure does not
        # alias PRIOR_4B_ROW.
        "rows": [dict(bar), row],
    }
    out_path = RESULTS / "minicpm_check.json"
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    print("\nwritten:", out_path)
# Script entry point: run the check only when executed directly
# (e.g. `python -m eval.minicpm_check`), not when imported.
if __name__ == "__main__":
    main()