"""W2.d — TOST equivalence statistics for the SFT null (the bounded negative claim). Operationalizes "weight interventions did not move held-out repair": paired per-dataset GEN-F1 deltas (retrain minus champion v6) over the 3 held-out EVAL sources x the 5-retrain SFT series (challenger seed31, v7 seed32, v8 seed33, v9 seed34, v10 seed35), pooled (n=15). DISCLOSED granularity: the retrain series was scored per held-out SOURCE only (eval/results/generalization_*.json) — the 42-pair paired bench exists for the shipped pipeline, not per retrain — so the unit here is per-dataset, not per-pair, and within-retrain deltas are clustered (flights/rayyan deltas are near-identical across retrains). A retrain-level robustness check (n=5 macro deltas, one per retrain) is reported alongside. PRE-REGISTERED (docs/ROADMAP_PUBLICATION.md W2.d, before this analysis ran): SESOI delta = +/-0.05 GEN-F1, justified as smaller than the gain deterministic grounding provides. TOST per Lakens'17: two one-sided t-tests against the SESOI bounds; equivalence p = max of the two. Bootstrap: 10k resamples, seed 42, 90% CI. uv run python -m eval.equivalence Writes eval/results/equivalence.json. """ from __future__ import annotations import json from pathlib import Path import numpy as np from scipy import stats RESULTS = Path(__file__).resolve().parent / "results" SESOI = 0.05 # pre-registered (roadmap W2.d) — do not change post hoc N_BOOT = 10_000 SEED = 42 CHAMPION = "generalization_champion.json" # champion v6/seed21 (union) RETRAINS = [ # the five SFT retrains (paper sec:negative) ("generalization_challenger.json", "challenger seed31"), ("generalization_v7.json", "v7 seed32 (unicode-punct archetype)"), ("generalization_v8.json", "v8 seed33 (+109k harvested alias vocabs)"), ("generalization_v9.json", "v9 seed34 (+MusicBrainz hints, gidcl pairs)"), ("generalization_v10.json", "v10 seed35 (suspects-contract)"), ] def _per_source_f1(fname: str) -> dict[str, float]: rec = json.loads((RESULTS / fname).read_text())[0] return {s["source"]: s["f1"] for s in rec["per_source"]}, rec["gen_f1"] def _tost(deltas: np.ndarray) -> dict: """Two one-sided t-tests against [-SESOI, +SESOI]; equivalence p = max.""" p_lo = stats.ttest_1samp(deltas, -SESOI, alternative="greater").pvalue p_hi = stats.ttest_1samp(deltas, +SESOI, alternative="less").pvalue return {"p_lower": float(p_lo), "p_upper": float(p_hi), "p_tost": float(max(p_lo, p_hi)), "n": int(len(deltas)), "mean": float(deltas.mean()), "sd": float(deltas.std(ddof=1))} def main() -> dict: champ, champ_macro = _per_source_f1(CHAMPION) pooled, per_retrain = [], [] for fname, label in RETRAINS: ps, macro = _per_source_f1(fname) assert set(ps) == set(champ), f"{fname}: source mismatch vs champion" per_retrain.append({ "retrain": label, "file": fname, "macro_gen_f1": round(macro, 6), "macro_delta": round(macro - champ_macro, 6), "per_dataset_delta": {s: round(ps[s] - champ[s], 6) for s in champ}, }) pooled += [ps[s] - champ[s] for s in sorted(champ)] deltas = np.array(pooled) rng = np.random.default_rng(SEED) boot = np.array([rng.choice(deltas, size=len(deltas), replace=True).mean() for _ in range(N_BOOT)]) ci = (float(np.percentile(boot, 5)), float(np.percentile(boot, 95))) macro_deltas = np.array([r["macro_delta"] for r in per_retrain]) out = { "spec": {"sesoi": SESOI, "sesoi_preregistered": "docs/ROADMAP_PUBLICATION.md W2.d", "n_boot": N_BOOT, "seed": SEED, "ci_level": 0.90, "champion": CHAMPION, "champion_macro_gen_f1": round(champ_macro, 6)}, "granularity": ("per-dataset (3 held-out sources x 5 retrains = 15 paired " "deltas). Per-pair rows do not exist for the retrain series " "(only the shipped pipeline was scored on the 42-pair bench); " "within-retrain deltas are clustered, hence the retrain-level " "robustness check below."), "per_retrain": per_retrain, "pooled_per_dataset": { **_tost(deltas), "ci90_bootstrap": [round(ci[0], 6), round(ci[1], 6)], "ci90_width": round(ci[1] - ci[0], 6), "equivalent_at_sesoi": bool(-SESOI < ci[0] and ci[1] < SESOI), }, "retrain_level_robustness": _tost(macro_deltas), "caveat": ("GEN-F1 sits near floor (champion 0.015 absolute), so the bound " "certifies absence of movement on a low-dynamic-range metric; " "the CI width (~0.004) shows the data could have detected effects " "an order of magnitude smaller than the 0.05 SESOI."), } p = out["pooled_per_dataset"] out["paper_sentence"] = ( f"Across the five-retrain series the mean held-out GEN-F1 delta (retrain " f"minus champion, per-dataset, n={p['n']}) is {p['mean']:+.4f} (90\\% " f"bootstrap CI [{ci[0]:+.4f}, {ci[1]:+.4f}]); TOST rejects effects larger " f"than the pre-registered $\\pm$0.05 SESOI (p = {p['p_tost']:.1e}), and the " f"retrain-level check (n=5 macro deltas) agrees " f"(p = {out['retrain_level_robustness']['p_tost']:.1e}).") (RESULTS / "equivalence.json").write_text(json.dumps(out, indent=2) + "\n") print(json.dumps({k: out[k] for k in ("pooled_per_dataset", "retrain_level_robustness", "paper_sentence")}, indent=2)) return out if __name__ == "__main__": main()