Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /equivalence.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 17 days ago

5.81 kB

	"""W2.d — TOST equivalence statistics for the SFT null (the bounded negative claim).

	Operationalizes "weight interventions did not move held-out repair": paired
	per-dataset GEN-F1 deltas (retrain minus champion v6) over the 3 held-out EVAL
	sources x the 5-retrain SFT series (challenger seed31, v7 seed32, v8 seed33,
	v9 seed34, v10 seed35), pooled (n=15). DISCLOSED granularity: the retrain series
	was scored per held-out SOURCE only (eval/results/generalization_*.json) — the
	42-pair paired bench exists for the shipped pipeline, not per retrain — so the
	unit here is per-dataset, not per-pair, and within-retrain deltas are clustered
	(flights/rayyan deltas are near-identical across retrains). A retrain-level
	robustness check (n=5 macro deltas, one per retrain) is reported alongside.

	PRE-REGISTERED (docs/ROADMAP_PUBLICATION.md W2.d, before this analysis ran):
	SESOI delta = +/-0.05 GEN-F1, justified as smaller than the gain deterministic
	grounding provides. TOST per Lakens'17: two one-sided t-tests against the SESOI
	bounds; equivalence p = max of the two. Bootstrap: 10k resamples, seed 42, 90% CI.

	uv run python -m eval.equivalence
	Writes eval/results/equivalence.json.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	import numpy as np
	from scipy import stats

	# Directory (next to this module) that holds the generalization_*.json inputs
	# and receives equivalence.json.
	RESULTS = Path(__file__).resolve().parent.joinpath("results")

	# Smallest effect size of interest, in GEN-F1 units. Pre-registered
	# (roadmap W2.d) — do not change post hoc.
	SESOI = 0.05
	N_BOOT = 10_000  # bootstrap resamples
	SEED = 42  # seed for the bootstrap RNG

	# Baseline results file: champion v6/seed21 (union).
	CHAMPION = "generalization_champion.json"

	# The five SFT retrains (paper sec:negative), as (results file, label) pairs.
	RETRAINS = [
	    (
	        "generalization_challenger.json",
	        "challenger seed31",
	    ),
	    (
	        "generalization_v7.json",
	        "v7 seed32 (unicode-punct archetype)",
	    ),
	    (
	        "generalization_v8.json",
	        "v8 seed33 (+109k harvested alias vocabs)",
	    ),
	    (
	        "generalization_v9.json",
	        "v9 seed34 (+MusicBrainz hints, gidcl pairs)",
	    ),
	    (
	        "generalization_v10.json",
	        "v10 seed35 (suspects-contract)",
	    ),
	]


	def _per_source_f1(fname: str) -> dict[str, float]:
	rec = json.loads((RESULTS / fname).read_text())[0]
	return {s["source"]: s["f1"] for s in rec["per_source"]}, rec["gen_f1"]


	def _tost(deltas: np.ndarray) -> dict:
	"""Two one-sided t-tests against [-SESOI, +SESOI]; equivalence p = max."""
	p_lo = stats.ttest_1samp(deltas, -SESOI, alternative="greater").pvalue
	p_hi = stats.ttest_1samp(deltas, +SESOI, alternative="less").pvalue
	return {"p_lower": float(p_lo), "p_upper": float(p_hi),
	"p_tost": float(max(p_lo, p_hi)), "n": int(len(deltas)),
	"mean": float(deltas.mean()), "sd": float(deltas.std(ddof=1))}


	def main() -> dict:
	    """Build the equivalence report, write it to disk and print a summary.

	    Loads the champion and every retrain listed in ``RETRAINS``, forms
	    per-dataset F1 deltas (retrain minus champion), pools them, and reports
	    a TOST on the pooled deltas, a seeded bootstrap 90% CI of their mean and
	    a TOST on the per-retrain macro deltas. The full report is written to
	    ``RESULTS / "equivalence.json"``; the pooled result, the retrain-level
	    result and the paper sentence are printed as JSON.

	    Returns:
	        The report dict that was written to disk.

	    Raises:
	        ValueError: If a retrain file's set of sources differs from the
	            champion's.
	    """
	    champ, champ_macro = _per_source_f1(CHAMPION)
	    sources = sorted(champ)
	    pooled: list[float] = []
	    raw_macro_deltas: list[float] = []
	    per_retrain: list[dict] = []
	    for fname, label in RETRAINS:
	        ps, macro = _per_source_f1(fname)
	        # Raise instead of assert: this input check must survive `python -O`.
	        if set(ps) != set(champ):
	            raise ValueError(f"{fname}: source mismatch vs champion")
	        macro_delta = macro - champ_macro
	        raw_macro_deltas.append(macro_delta)
	        per_retrain.append({
	            "retrain": label, "file": fname,
	            "macro_gen_f1": round(macro, 6),
	            "macro_delta": round(macro_delta, 6),
	            "per_dataset_delta": {s: round(ps[s] - champ[s], 6) for s in champ},
	        })
	        pooled.extend(ps[s] - champ[s] for s in sources)
	    deltas = np.array(pooled)

	    # Bootstrap of the pooled mean. The per-resample loop is kept as-is so the
	    # seeded draw sequence (and therefore the reported CI) stays reproducible.
	    rng = np.random.default_rng(SEED)
	    boot = np.array([rng.choice(deltas, size=len(deltas), replace=True).mean()
	                     for _ in range(N_BOOT)])
	    # 5th/95th percentiles give the 90% interval.
	    ci = (float(np.percentile(boot, 5)), float(np.percentile(boot, 95)))

	    # Test the unrounded macro deltas; the 6-dp rounded copies stored in
	    # per_retrain are for display only.
	    macro_deltas = np.array(raw_macro_deltas)
	    out = {
	        "spec": {"sesoi": SESOI, "sesoi_preregistered": "docs/ROADMAP_PUBLICATION.md W2.d",
	                 "n_boot": N_BOOT, "seed": SEED, "ci_level": 0.90,
	                 "champion": CHAMPION, "champion_macro_gen_f1": round(champ_macro, 6)},
	        "granularity": ("per-dataset (3 held-out sources x 5 retrains = 15 paired "
	                        "deltas). Per-pair rows do not exist for the retrain series "
	                        "(only the shipped pipeline was scored on the 42-pair bench); "
	                        "within-retrain deltas are clustered, hence the retrain-level "
	                        "robustness check below."),
	        "per_retrain": per_retrain,
	        "pooled_per_dataset": {
	            **_tost(deltas),
	            "ci90_bootstrap": [round(ci[0], 6), round(ci[1], 6)],
	            "ci90_width": round(ci[1] - ci[0], 6),
	            # Equivalent only if the whole CI lies strictly inside the SESOI band.
	            "equivalent_at_sesoi": bool(-SESOI < ci[0] and ci[1] < SESOI),
	        },
	        "retrain_level_robustness": _tost(macro_deltas),
	        # NOTE(review): the figures quoted in this caveat (0.015, ~0.004) are
	        # hard-coded text, not derived from the values computed above.
	        "caveat": ("GEN-F1 sits near floor (champion 0.015 absolute), so the bound "
	                   "certifies absence of movement on a low-dynamic-range metric; "
	                   "the CI width (~0.004) shows the data could have detected effects "
	                   "an order of magnitude smaller than the 0.05 SESOI."),
	    }
	    p = out["pooled_per_dataset"]
	    out["paper_sentence"] = (
	        f"Across the five-retrain series the mean held-out GEN-F1 delta (retrain "
	        f"minus champion, per-dataset, n={p['n']}) is {p['mean']:+.4f} (90\\% "
	        f"bootstrap CI [{ci[0]:+.4f}, {ci[1]:+.4f}]); TOST rejects effects larger "
	        f"than the pre-registered $\\pm$0.05 SESOI (p = {p['p_tost']:.1e}), and the "
	        f"retrain-level check (n=5 macro deltas) agrees "
	        f"(p = {out['retrain_level_robustness']['p_tost']:.1e}).")

	    (RESULTS / "equivalence.json").write_text(
	        json.dumps(out, indent=2) + "\n", encoding="utf-8")
	    print(json.dumps({k: out[k] for k in ("pooled_per_dataset",
	                                          "retrain_level_robustness",
	                                          "paper_sentence")}, indent=2))
	    return out


	# Script entry point: run the analysis only when this module is executed
	# directly (e.g. `uv run python -m eval.equivalence`), not when imported.
	if __name__ == "__main__":
	    main()