Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 11 days ago

7.48 kB

	"""Fast standalone eval of the v5 adapter saved in the Modal Volume.

	The in-training eval was slow because it ran the 4-bit (bitsandbytes) training model.
	Here we load the base in BF16 + the LoRA adapter from the volume and merge -> fast
	generation. Reports synthetic gold + the real Raha hospital repair_recall (the headline).

	uv run modal run --detach scripts/modal_eval_v5.py
	"""

	import modal

	# Patterns excluded when the repository root is copied into the image.
	# NOTE(review): ".gguf", "/__pycache__/" and "frontend/variant_/" look like glob
	# patterns that lost their `*` characters, and the comment below speaks of
	# `data/**` while the entry here is "data/" — confirm against the repo copy.
	IGNORE = [
	    ".venv/",
	    ".git/",
	    ".gguf",
	    "/__pycache__/",
	    ".gstack/*",
	    "design/*",
	    "frontend/variant_/",
	    "notebooks/",
	    ".pytest_cache/",
	    "data/",
	]

	# harvested EVAL-ONLY pairs (data/** is ignored above; Raha sets auto-download
	# in-container, these only exist locally via the stage-2/3 harvesters)
	_EVAL_ONLY_DIRS = (
	    ("data/real/ed2_restaurants", "/root/repo/data/real/ed2_restaurants"),
	    ("data/real/tt_co23z7go", "/root/repo/data/real/tt_co23z7go"),
	    ("data/real/tt_uma1dnf6", "/root/repo/data/real/tt_uma1dnf6"),
	    ("data/real/zeroed_billionaire", "/root/repo/data/real/zeroed_billionaire"),
	)

	# entity-reference vocabularies (reconcile.default_index loads from data/)
	_REFERENCE_FILES = (
	    ("training/harvests/toughtables_ref.jsonl",
	     "/root/repo/data/toughtables_ref.jsonl"),
	    ("training/harvests/musicbrainz_hint_aliases.jsonl",
	     "/root/repo/data/musicbrainz_hint_aliases.jsonl"),
	    ("training/harvests/wikidata_company_aliases.jsonl",
	     "/root/repo/data/wikidata_company_aliases.jsonl"),
	    ("training/harvests/ror_aliases.jsonl",
	     "/root/repo/data/ror_aliases.jsonl"),
	)

	# Base layers: Python 3.11 slim image, pip dependencies, then the repo itself.
	image = modal.Image.debian_slim(python_version="3.11")
	image = image.pip_install(
	    "torch", "transformers>=4.45", "peft", "accelerate",
	    "pandas", "jsonschema", "pycountry", "sentencepiece",
	)
	image = image.add_local_dir(".", "/root/repo", ignore=IGNORE, copy=True)

	# Layer the extra directories and files on top, in the same order as listed.
	for _src, _dst in _EVAL_ONLY_DIRS:
	    image = image.add_local_dir(_src, _dst, copy=True)
	for _src, _dst in _REFERENCE_FILES:
	    image = image.add_local_file(_src, _dst, copy=True)

	app = modal.App("scrubdata-eval-v5", image=image)
	# Volume holding the trained adapter; mounted at /vol by run_eval.
	adapter_vol = modal.Volume.from_name("scrubdata-v5-adapter")
	# Named Dict where run_eval stores its output and formatted table.
	results = modal.Dict.from_name("scrubdata-eval-v5-results", create_if_missing=True)


	@app.function(gpu="A100-80GB", timeout=7200, volumes={"/vol": adapter_vol})
	def run_eval(n_synth: int = 20, adapter: str = "/vol/v5", skip_real: bool = False,
	             pair_profiles: bool = False, capture: str = ""):
	    """Evaluate a merged BF16 base+LoRA model and store the results.

	    Args:
	        n_synth: Number of synthetic gold cases (taken from the front of
	            ``load_gold()``) used for the layer-1 comparison.
	        adapter: Path of the LoRA adapter to load; the adapter volume is
	            mounted at ``/vol``.
	        skip_real: When true, skip the real hospital evaluation.
	        pair_profiles: When true, pass candidate pairs into the prompt and
	            constrain the returned plan with them.
	        capture: Comma-separated dataset names for which raw model plans are
	            captured into ``out["plans"]``. Surrounding whitespace is
	            stripped and empty entries are ignored.

	    Returns:
	        The result dict: ``layer1`` always; ``hospital_ft``, ``hospital_noop``
	        and ``hospital_plan`` unless ``skip_real``; ``plans`` when ``capture``
	        is non-empty. The same dict plus the formatted table is written to the
	        ``results`` Dict under a key derived from ``adapter``.
	    """
	    import os
	    import sys

	    import torch

	    # The repo is copied to /root/repo; make it the cwd and importable before
	    # pulling in the project modules below.
	    os.chdir("/root/repo")
	    sys.path.insert(0, "/root/repo")
	    from transformers import AutoModelForCausalLM, AutoTokenizer
	    from peft import PeftModel

	    from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
	    from scrubdata.profiler import profile_dataframe
	    from scrubdata.model_planner import _extract_json, make_batched_planner
	    from scrubdata.executor import apply_plan
	    from scrubdata.planner import mock_plan
	    from eval.run_eval import evaluate
	    from eval.gold import load_gold
	    from eval.run_real import _ensure_data, _load, _score

	    base_id = "unsloth/Qwen3-4B-Instruct-2507"
	    tok = AutoTokenizer.from_pretrained(base_id)
	    base = AutoModelForCausalLM.from_pretrained(
	        base_id, torch_dtype=torch.bfloat16, device_map="cuda")
	    model = PeftModel.from_pretrained(base, adapter).merge_and_unload()  # bf16-native merge
	    model.eval()
	    model.config.use_cache = True

	    # Stop on the chat end-of-turn token as well as the tokenizer's EOS. The
	    # literal must be exactly "<|im_end|>": backslash-escaped pipes produce a
	    # different string that is not in the vocabulary.
	    im_end = tok.convert_tokens_to_ids("<|im_end|>")
	    eos_ids = [tok.eos_token_id, im_end] if im_end is not None else tok.eos_token_id

	    def base_planner(df, *_):
	        """Generate one cleaning plan for ``df``; extra positional args are ignored."""
	        pairs = None
	        if pair_profiles:
	            from scrubdata.pair_profile import pairs_for_df
	            pairs = pairs_for_df(df)
	        msgs = [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user",
	             "content": build_user_prompt(profile_dataframe(df), df,
	                                          candidate_pairs=pairs)},
	        ]
	        enc = tok.apply_chat_template(msgs, add_generation_prompt=True,
	                                      return_tensors="pt", return_dict=True)
	        ids = enc["input_ids"].to(model.device)
	        with torch.no_grad():
	            # Greedy decoding. NOTE(review): 151657/151658 are presumably the
	            # <tool_call>/</tool_call> token ids of this tokenizer — confirm.
	            generated = model.generate(
	                input_ids=ids,
	                attention_mask=enc["attention_mask"].to(model.device),
	                max_new_tokens=2200, do_sample=False, eos_token_id=eos_ids,
	                pad_token_id=tok.eos_token_id, use_cache=True,
	                suppress_tokens=[151657, 151658])  # block <tool_call> loop
	        # Decode only the newly generated tokens, not the prompt.
	        text = tok.decode(generated[0][ids.shape[1]:], skip_special_tokens=True)
	        plan = _extract_json(text)
	        if plan is None:
	            return {"__error__": "no_json"}
	        # Make sure the top-level keys exist even if the model omitted them.
	        plan.setdefault("table_operations", [])
	        plan.setdefault("columns", [])
	        plan.setdefault("flags", [])
	        if pairs is not None:
	            from scrubdata.pair_profile import constrain_plan
	            plan = constrain_plan(plan, pairs)
	        return plan

	    out = {}
	    gold = load_gold()[:n_synth]
	    planners = {
	        "HEURISTIC": lambda df, gp: mock_plan(df),
	        "FT_v5": base_planner,
	    }
	    out["layer1"] = {name: evaluate(fn, gold) for name, fn in planners.items()}

	    if not skip_real:
	        _ensure_data()
	        dirty, clean = _load()
	        ft_plan = make_batched_planner(base_planner, batch_size=4)(dirty)
	        cleaned, _ = apply_plan(dirty, ft_plan)
	        out["hospital_ft"] = _score(dirty, clean, cleaned)
	        # Baseline: score the untouched dirty table against the clean one.
	        out["hospital_noop"] = _score(dirty, clean, dirty)
	        out["hospital_plan"] = ft_plan  # raw plan for local precision-curve sweeps

	    if capture:
	        # capture raw grounded model plans for arbitrary eval datasets (GEN metric:
	        # plans are applied + scored locally with the full union pipeline). Tables
	        # are loaded FULL (same loader contract as eval/generalization.py).
	        from eval.run_real_multi import _fetch
	        out["plans"] = {}
	        # Tolerate "a, b" and trailing commas instead of fetching " b" or "".
	        names = [part.strip() for part in capture.split(",") if part.strip()]
	        for name in names:
	            dirty, _clean = _fetch(name)
	            print(f"capturing plan: {name} ({len(dirty)} rows)", flush=True)
	            out["plans"][name] = make_batched_planner(base_planner, batch_size=4)(dirty)

	    table = _format(out)
	    print(table)
	    # The default adapter is stored as "latest"; any other adapter is stored
	    # under the last component of its path.
	    key = adapter.rsplit("/", 1)[-1] if adapter != "/vol/v5" else "latest"
	    if pair_profiles:
	        key += "_pairs"
	    results[key] = {"out": out, "table": table}
	    return out


	def _format(r) -> str:
	    """Render the eval result dict as a plain-text report.

	    Args:
	        r: Result mapping. ``r["layer1"]`` maps a system name to its metrics
	            (``json_valid``, ``op_f1``, ``canon_f1``, ``recovery``). When
	            ``"hospital_ft"`` is present, ``hospital_noop`` and ``hospital_ft``
	            are read for ``repair_recall``, ``repair_prec`` and ``recovery``.

	    Returns:
	        The report as one string. A metric that is missing or cannot be
	        formatted as a float is shown as ``n/a`` rather than raising, so a
	        formatting problem cannot abort the caller before results are saved.
	    """
	    def cell(metrics, key, spec, width=0):
	        # KeyError: metric absent; TypeError: None / non-subscriptable;
	        # ValueError: value (e.g. a string) rejects the float format spec.
	        try:
	            return format(metrics[key], spec)
	        except (KeyError, TypeError, ValueError):
	            return "n/a".rjust(width)

	    lines = ["\n=== Layer 1 (synthetic) ==="]
	    cols = ["json_valid", "op_f1", "canon_f1", "recovery"]
	    lines.append(f"{'system':<12}" + "".join(f"{c:>11}" for c in cols))
	    for name, m in r["layer1"].items():
	        lines.append(f"{name:<12}" + "".join(cell(m, c, ">11.3f", 11) for c in cols))

	    # The hospital section is only present when the real eval was run.
	    if "hospital_ft" not in r:
	        return "\n".join(lines)

	    lines.append("\n=== Real hospital ===")
	    for k in ("hospital_noop", "hospital_ft"):
	        m = r.get(k, {})
	        lines.append(f"{k:<13} repair_recall={cell(m, 'repair_recall', '.3f')} "
	                     f"repair_prec={cell(m, 'repair_prec', '.3f')} "
	                     f"recovery={cell(m, 'recovery', '.3f')}")
	    return "\n".join(lines)


	@app.local_entrypoint()
	def main(adapter: str = "/vol/v5", skip_real: bool = False, n_synth: int = 20,
	         pair_profiles: bool = False, capture: str = ""):
	    """Start ``run_eval`` remotely via ``spawn`` and print the call id.

	    All arguments are forwarded unchanged to ``run_eval``. The returned call
	    is not awaited here.

	    NOTE(review): the printed message says "detached"; presumably the app must
	    be started with ``modal run --detach`` for the spawned call to keep running
	    after this entrypoint returns — confirm.
	    """
	    eval_kwargs = {
	        "adapter": adapter,
	        "skip_real": skip_real,
	        "n_synth": n_synth,
	        "pair_profiles": pair_profiles,
	        "capture": capture,
	    }
	    handle = run_eval.spawn(**eval_kwargs)
	    print(f"Launched detached. call_id={handle.object_id}")