Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 15 days ago

2.21 kB

	"""Frozen held-out gold eval set (committed to eval/gold.jsonl).

	A FIXED test set so every fine-tune iteration (and generator change) is scored on the
	same examples — v1 vs v2 stay comparable. Regenerate intentionally with `build_gold`.
	"""

	from __future__ import annotations

	import json
	import random
	from pathlib import Path

	import pandas as pd

	from training.generate import make_example

	from . import metrics

	GOLD_PATH = Path(__file__).resolve().parent / "gold.jsonl"


	def build_gold(n: int = 300, seed: int = 4242, path: Path = GOLD_PATH) -> list[dict]:
	    """Generate ``n`` gold examples deterministically and write them to ``path`` as JSONL.

	    Examples are drawn from ``make_example`` with a ``random.Random`` seeded by
	    ``seed``. A draw is kept only when ``metrics.recovery`` of its clean frame,
	    dirty frame and plan is at least 0.999. Each kept example becomes one JSON
	    line holding the dirty/clean rows, both column orders and the plan.

	    Args:
	        n: Number of examples to collect.
	        seed: Seed for the private RNG handed to ``make_example``.
	        path: Destination JSONL file; it is overwritten if it already exists.

	    Returns:
	        The accepted in-memory examples, exactly as ``make_example`` produced them.
	    """
	    rng = random.Random(seed)
	    out = []
	    # NOTE(review): this loops until n draws pass the filter; presumably
	    # make_example yields passing examples often enough to terminate -- confirm.
	    while len(out) < n:
	        ex = make_example(rng)
	        if metrics.recovery(ex["clean_df"], ex["dirty_df"], ex["plan"]) >= 0.999:
	            out.append(ex)

	    target = Path(path)
	    # A caller-supplied path may point into a directory that does not exist yet.
	    target.parent.mkdir(parents=True, exist_ok=True)
	    with target.open("w", encoding="utf-8") as f:
	        for ex in out:
	            clean_df = ex["clean_df"]
	            dirty_df = ex["dirty_df"]
	            # Cast to object before masking: on numeric columns recent pandas
	            # keeps NaN when ``other`` is None, so missing cells would be
	            # written as the non-standard JSON token NaN instead of null.
	            clean = clean_df.astype(object).where(pd.notna(clean_df), None)
	            record = {
	                "dirty": dirty_df.to_dict("records"),
	                "clean": clean.to_dict("records"),
	                "dirty_cols": list(dirty_df.columns),
	                "clean_cols": list(clean_df.columns),
	                "plan": ex["plan"],
	            }
	            # default=str stringifies any value json cannot encode natively.
	            f.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
	    return out


	def load_gold(path: Path = GOLD_PATH) -> list[dict]:
	    """Load the gold set from ``path``, building it first if the file is missing.

	    The examples are always parsed back from the file, including right after a
	    fresh build, so the first call returns the same representation as every
	    later call.

	    Args:
	        path: JSONL file with one example per line, holding the keys
	            ``dirty``, ``clean``, ``dirty_cols``, ``clean_cols`` and ``plan``.

	    Returns:
	        One dict per example with keys ``dirty_df``, ``clean_df`` and ``plan``.
	    """
	    p = Path(path)
	    if not p.exists():
	        build_gold(path=p)

	    out = []
	    # Iterate the file handle instead of str.splitlines(): splitlines also
	    # breaks on U+2028/U+2029/U+0085, which json.dumps(ensure_ascii=False)
	    # writes unescaped inside strings, so such a record would be cut in half.
	    with p.open(encoding="utf-8") as f:
	        for line in f:
	            if not line.strip():
	                continue  # tolerate blank lines, e.g. a trailing newline
	            d = json.loads(line)
	            out.append({
	                "dirty_df": _frame_from_records(d["dirty"], d["dirty_cols"]),
	                "clean_df": _frame_from_records(d["clean"], d["clean_cols"]),
	                "plan": d["plan"],
	            })
	    return out


	def _frame_from_records(records: list[dict], columns: list[str]) -> pd.DataFrame:
	    """Build a frame from row dicts in ``columns`` order; no rows gives an empty frame."""
	    if records:
	        return pd.DataFrame(records)[columns]
	    return pd.DataFrame(columns=columns)


	if __name__ == "__main__":
	    # Command-line entry point: rebuild the gold file at the default location.
	    import argparse

	    parser = argparse.ArgumentParser()
	    for flag, fallback in (("--n", 300), ("--seed", 4242)):
	        parser.add_argument(flag, type=int, default=fallback)
	    cli = parser.parse_args()

	    written = build_gold(cli.n, cli.seed)
	    print(f"Wrote {len(written)} frozen gold examples to {GOLD_PATH}")