File size: 2,694 Bytes
f28d994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""Generate the seed-202 validation labels and verify alignment with cached OOF.

The split is deterministic (pandas.sample(random_state=202) + np.random.default_rng(202)),
identical to what produced every *_oof.npy in the stacking pipeline. We regenerate it from
`train_val_lgcn_ensemble.make_notebook_style_split`, save y + pairs, then run an alignment
assertion: recomputing best-F1 on the FINAL model OOF must reproduce ~0.966874. If it does,
the labels are row-aligned with every other OOF on the same split.

Usage:
    python code/figures/gen_val_labels.py --package-root .
"""
from __future__ import annotations

import argparse
import importlib.util
import sys
from pathlib import Path

import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score


def load_module(name: str, path: Path):
    """Import the Python source file at *path* and register it as module *name*.

    The module is inserted into ``sys.modules`` before its body is executed,
    so code inside the file can resolve its own module name while it is
    being imported.

    Args:
        name: Key under which the module is stored in ``sys.modules``.
        path: Location of the source file to execute.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec or loader can be derived from *path*
            (for example, the file suffix is not a recognised Python one).
        Exception: Whatever the module body (or the loader) raises; in that
            case the ``sys.modules`` entry is rolled back first.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # An explicit raise instead of ``assert``: asserts are stripped under
    # ``python -O`` and ``spec`` itself may be None, not just ``spec.loader``.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"cannot build an import spec for module {name!r} from {path}",
            name=name,
            path=str(path),
        )

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(name)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable by later code;
        # put back whatever was registered under this name before the call.
        if previous is not None:
            sys.modules[name] = previous
        else:
            sys.modules.pop(name, None)
        raise
    return module


def main() -> None:
    """Regenerate the validation labels for one split seed and verify OOF alignment.

    Steps:
      1. Parse the command-line options.
      2. Load ``code/train_val_lgcn_ensemble.py`` from the package root and call
         its ``make_notebook_style_split(root, split_seed, train_frac)``.
      3. Save the validation labels (int8) and the ``(source, target)`` pairs
         (int64) as ``.npy`` files under ``validation_runs/dynamic_seed<seed>/``.
      4. Load the final-model OOF scores from the same directory, recompute the
         best F1 over the precision-recall curve and the ROC AUC, and compare
         the F1 with the expected value.

    The label files are written before the alignment check runs, so they exist
    on disk even when the check fails.

    Raises:
        AssertionError: If the label and OOF arrays differ in length, or the
            recomputed best F1 is not within ``--f1-tol`` of ``--expected-f1``.
            Raised explicitly (not via ``assert``) so that the verification is
            still performed under ``python -O``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[2])
    ap.add_argument("--split-seed", type=int, default=202)
    ap.add_argument("--train-frac", type=float, default=0.9)
    # The reference metrics belong to the default seed-202 split; they are
    # options so that a different --split-seed can supply its own values.
    ap.add_argument(
        "--expected-f1",
        type=float,
        default=0.966874,
        help="best F1 the final-model OOF must reproduce (default is for split seed 202)",
    )
    ap.add_argument(
        "--expected-auc",
        type=float,
        default=0.994918,
        help="reference ROC AUC, printed for information only (default is for split seed 202)",
    )
    ap.add_argument(
        "--f1-tol",
        type=float,
        default=5e-4,
        help="maximum absolute deviation of the best F1 from --expected-f1",
    )
    args = ap.parse_args()

    root = args.package_root.resolve()
    vr = root / "validation_runs" / f"dynamic_seed{args.split_seed}"
    vr.mkdir(parents=True, exist_ok=True)

    lgcn = load_module("lgcn", root / "code/train_val_lgcn_ensemble.py")
    # Only the validation half of the split is needed here.
    _, val_pairs = lgcn.make_notebook_style_split(root, args.split_seed, args.train_frac)

    y = val_pairs["label"].to_numpy(np.int8)
    pairs = val_pairs[["source", "target"]].to_numpy(np.int64)
    np.save(vr / f"val_labels_seed{args.split_seed}.npy", y)
    np.save(vr / f"val_pairs_seed{args.split_seed}.npy", pairs)
    print(f"saved y shape={y.shape} positive_ratio={y.mean():.4f} (expect ~0.5)")

    # --- alignment check against the final model OOF ---
    oof_path = vr / "high_order_graph_stack/rich_rw7_highorder_directed_oof.npy"
    oof = np.load(oof_path).astype(np.float64)
    if len(y) != len(oof):
        raise AssertionError(f"length mismatch: y={len(y)} oof={len(oof)}")

    p, r, _ = precision_recall_curve(y, oof)
    # The 1e-12 term avoids 0/0 at thresholds where precision + recall == 0.
    f1 = float((2 * p * r / (p + r + 1e-12)).max())
    auc = float(roc_auc_score(y, oof))
    print(
        f"final OOF  best_f1={f1:.6f}  auc={auc:.6f}  "
        f"(expect ~{args.expected_f1:.6f} / ~{args.expected_auc:.6f})"
    )
    # Written as "not (... < tol)" so that a NaN F1 also fails the check.
    if not abs(f1 - args.expected_f1) < args.f1_tol:
        raise AssertionError(
            f"ALIGNMENT FAILED: got {f1}, expected ~{args.expected_f1:.6f}"
        )
    print(
        "ALIGNMENT OK -> y is row-aligned with all *_oof.npy "
        f"on split_seed={args.split_seed}"
    )


# Run the label generation and alignment check only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()