File size: 2,694 Bytes
f28d994 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | """Generate the seed-202 validation labels and verify alignment with cached OOF.
The split is deterministic (pandas.sample(random_state=202) + np.random.default_rng(202)),
identical to what produced every *_oof.npy in the stacking pipeline. We regenerate it from
`train_val_lgcn_ensemble.make_notebook_style_split`, save y + pairs, then run an alignment
assertion: recomputing best-F1 on the FINAL model OOF must reproduce ~0.966874. If it does,
the labels are row-aligned with every other OOF on the same split.
Usage:
python code/figures/gen_val_labels.py --package-root .
"""
from __future__ import annotations
import argparse
import importlib.util
import sys
from pathlib import Path
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score
def load_module(name: str, path: Path):
    """Import the Python source file at *path* as a module named *name*.

    The module is registered in ``sys.modules`` under *name* before its body
    is executed, and the executed module object is returned.

    Args:
        name: Module name to register in ``sys.modules``.
        path: Filesystem location of the source file to execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec or loader can be built for *path*
            (for example, an unsupported file extension).
        Exception: Whatever the module body itself raises while executing;
            in that case the ``sys.modules`` entry is rolled back.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # Validate with a real exception rather than `assert`: asserts vanish
    # under `python -O`, and a None spec would otherwise surface later as an
    # opaque AttributeError inside module_from_spec.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"cannot load module {name!r} from {path}",
            name=name,
            path=str(path),
        )

    module = importlib.util.module_from_spec(spec)
    _absent = object()
    previous = sys.modules.get(name, _absent)
    # Register before executing so code in the module body that looks itself
    # up through sys.modules can find the module.
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable; put back
        # whatever was registered under this name before the attempt.
        if previous is _absent:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return module
def main() -> None:
    """Regenerate validation labels for one split and check them against the final OOF.

    Steps, in order:

    1. Parse ``--package-root``, ``--split-seed`` and ``--train-frac``, plus
       the optional reference values ``--expected-f1`` and ``--f1-tol``.
    2. Load ``code/train_val_lgcn_ensemble.py`` from the package root, call
       its ``make_notebook_style_split`` and save the validation labels and
       ``(source, target)`` pairs as ``.npy`` files under
       ``validation_runs/dynamic_seed<seed>/``.
    3. Load the final-model OOF scores, recompute best-F1 and ROC-AUC against
       the fresh labels, and fail unless best-F1 is within the tolerance of
       the expected value.

    Raises:
        AssertionError: If the label and OOF lengths differ, or the
            recomputed best-F1 is outside the tolerance.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[2])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--train-frac", type=float, default=0.9)
    parser.add_argument(
        "--expected-f1",
        type=float,
        default=0.966874,
        help="reference best-F1 of the final model OOF on this split",
    )
    parser.add_argument(
        "--f1-tol",
        type=float,
        default=5e-4,
        help="maximum allowed |best_f1 - expected_f1|",
    )
    args = parser.parse_args()

    root = args.package_root.resolve()
    run_dir = root / "validation_runs" / f"dynamic_seed{args.split_seed}"
    run_dir.mkdir(parents=True, exist_ok=True)

    lgcn = load_module("lgcn", root / "code/train_val_lgcn_ensemble.py")
    # Only the validation half of the split is needed here.
    # NOTE(review): val_pairs is presumably a pandas DataFrame with
    # "label", "source" and "target" columns -- confirm against the helper.
    _, val_pairs = lgcn.make_notebook_style_split(root, args.split_seed, args.train_frac)

    y = val_pairs["label"].to_numpy(np.int8)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    np.save(run_dir / f"val_labels_seed{args.split_seed}.npy", y)
    np.save(run_dir / f"val_pairs_seed{args.split_seed}.npy", pairs)
    print(f"saved y shape={y.shape} positive_ratio={y.mean():.4f} (expect ~0.5)")

    # --- alignment check against the final model OOF ---
    oof_path = run_dir / "high_order_graph_stack/rich_rw7_highorder_directed_oof.npy"
    oof = np.load(oof_path).astype(np.float64)

    # The checks below are the whole point of this script, so they raise
    # explicitly: plain `assert` statements are removed under `python -O`,
    # which would let the script report success without verifying anything.
    if len(y) != len(oof):
        raise AssertionError(f"length mismatch: y={len(y)} oof={len(oof)}")

    precision, recall, _ = precision_recall_curve(y, oof)
    # The small epsilon avoids 0/0 where precision and recall are both zero.
    f1 = float((2 * precision * recall / (precision + recall + 1e-12)).max())
    auc = float(roc_auc_score(y, oof))
    print(
        f"final OOF best_f1={f1:.6f} auc={auc:.6f} "
        f"(expect ~{args.expected_f1:.6f} / ~0.994918)"
    )

    if not abs(f1 - args.expected_f1) < args.f1_tol:
        raise AssertionError(
            f"ALIGNMENT FAILED: got {f1}, expected ~{args.expected_f1:.6f}"
        )
    # Report the seed actually used rather than a hard-coded 202.
    print(f"ALIGNMENT OK -> y is row-aligned with all *_oof.npy on split_seed={args.split_seed}")


if __name__ == "__main__":
    main()
|