"""Evaluate the SHIPPED system (verified union planner) on the wide validation suite (churn-neutral metric) — the model row for the paper's money table. Loads the adapter (Modal volume, default /vol/v5_seed21 = v6) merged into bf16 and wraps it in EXACTLY the active.py composition: batched + RACOON-grounded, then per-entry verifier (tau=0.5), then union with the grounded heuristic — paper-product identity. Runs the suite's REAL slice (5 Raha benchmarks) plus the typo-injected slice (the canonicalization regime the model is for). Single seed (the CI row comes from the cheap heuristic systems); scoped honestly. uv run modal run --detach scripts/modal_eval_suite.py # shipped (v6+union) uv run modal run --detach scripts/modal_eval_suite.py --no-union # bare grounded model """ import modal IGNORE = [".venv/**", ".git/**", "*.gguf", "**/__pycache__/**", ".gstack/**", "design/**", "frontend/variant_*/**", "notebooks/**", ".pytest_cache/**", "data/**"] image = ( modal.Image.debian_slim(python_version="3.11") .pip_install("torch", "transformers>=4.45", "peft", "accelerate", "pandas", "jsonschema", "pycountry", "sentencepiece") .add_local_dir(".", "/root/repo", ignore=IGNORE, copy=True) .add_local_dir("data/real/cache", "/root/repo/data/real/cache", copy=True) .add_local_file("training/unpaired_sources.json", "/root/repo/training/unpaired_sources.json", copy=True) ) app = modal.App("scrubdata-eval-suite", image=image) adapter_vol = modal.Volume.from_name("scrubdata-v5-adapter") results = modal.Dict.from_name("scrubdata-suite-results", create_if_missing=True) @app.function(gpu="A100-80GB", timeout=4 * 3600, volumes={"/vol": adapter_vol}) def run_suite(seed: int = 7, adapter: str = "/vol/v5_seed21", union: bool = True): import os, sys, torch os.chdir("/root/repo") sys.path.insert(0, "/root/repo") from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt from scrubdata.profiler import profile_dataframe from scrubdata.model_planner import _extract_json, make_batched_planner from scrubdata.grounded import make_grounded_planner from scrubdata.executor import apply_plan from eval.run_real_multi import build_suite, score, _cell_only, abstain_slice base_id = "unsloth/Qwen3-4B-Instruct-2507" tok = AutoTokenizer.from_pretrained(base_id) base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="cuda") model = PeftModel.from_pretrained(base, adapter).merge_and_unload() model.eval() model.config.use_cache = True im_end = tok.convert_tokens_to_ids("<|im_end|>") eos_ids = [tok.eos_token_id, im_end] if im_end is not None else tok.eos_token_id def base_planner(df, *_): msgs = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": build_user_prompt(profile_dataframe(df), df)}] enc = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt", return_dict=True) ids = enc["input_ids"].to(model.device) with torch.no_grad(): out = model.generate(input_ids=ids, attention_mask=enc["attention_mask"].to(model.device), max_new_tokens=2000, do_sample=False, eos_token_id=eos_ids, pad_token_id=tok.eos_token_id, use_cache=True, suppress_tokens=[151657, 151658]) text = tok.decode(out[0][ids.shape[1]:], skip_special_tokens=True) plan = _extract_json(text) if plan is None: return {"__error__": "no_json"} plan.setdefault("table_operations", []) plan.setdefault("columns", []) plan.setdefault("flags", []) return plan grounded = make_grounded_planner(make_batched_planner(base_planner, batch_size=4)) if union: # the SHIPPED active.py composition (WS1) from scrubdata.planner import mock_plan from scrubdata.verifier import union_plans, verify_plan def planner(df, *_): return union_plans(verify_plan(df, grounded(df), tau=0.5), mock_plan(df)) else: planner = grounded # scoped slice: all REAL + the typo-injected datasets (the canonicalization regime) specs = [s for s in build_suite(seed=seed) if s.get("source") == "real" or s["name"].endswith(":typo")] rows = [] for spec in specs: try: loaded = spec["load"]() except Exception as e: # noqa: BLE001 print(f" {spec['name']}: load failed {type(e).__name__}", flush=True) continue if loaded is None: continue dirty, clean = loaded try: cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty))) m = score(dirty, clean, cleaned) except Exception as e: # noqa: BLE001 print(f" {spec['name']}: eval failed {type(e).__name__}", flush=True) continue rows.append({"name": spec["name"], "source": spec.get("source", "injected"), "f1": m["f1"], "recall": m["recall"], "precision": m["precision"], "damage": m["damage"]}) results["_partial"] = rows # survive a timeout: per-spec checkpoint print(f" {spec['name']:<26} F1={m['f1']:.3f} P={m['precision']:.3f} " f"R={m['recall']:.3f} dmg={m['damage']:.3f}", flush=True) ab = abstain_slice(planner) def mean(xs): xs = list(xs) return sum(xs) / len(xs) if xs else 0.0 summary = { "real_f1": mean(r["f1"] for r in rows if r["source"] == "real"), "injected_typo_f1": mean(r["f1"] for r in rows if r["source"] != "real"), "damage": mean(r["damage"] for r in rows), "abstain_accuracy": ab["abstain_accuracy"], "typo_recall": ab["typo_recall"], "n_datasets": len(rows), "rows": rows, } label = ("union_" if union else "grounded_") + adapter.rsplit("/", 1)[-1] summary["system"] = label print(f"\n{label} on suite:", {k: round(v, 3) for k, v in summary.items() if isinstance(v, float)}) results[label] = summary results["latest"] = summary return summary @app.local_entrypoint() def main(seed: int = 7, adapter: str = "/vol/v5_seed21", union: bool = True): call = run_suite.spawn(seed=seed, adapter=adapter, union=union) print(f"Launched detached. call_id={call.object_id}")