"""Validation OOF for a selected random-walk ensemble."""
from __future__ import annotations
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import randomwalk_systematic_ablation as rw
from generate_randomwalk_ensemble_submission import aggregate
def main() -> None:
    """Fit and save out-of-fold predictions for a selected random-walk ensemble.

    Parses the command line, loads the saved Word2Vec model for every
    requested version, and column-stacks the base features, one pair-feature
    block per version and the output of ``aggregate(blocks)`` into a single
    float32 matrix.  That matrix is passed to ``rw.fit_lgb_oof`` and the
    resulting out-of-fold predictions are scored with ``rw.best_f1``.

    Outputs, all under
    ``<package-root>/validation_runs/dynamic_seed<split-seed>/randomwalk_systematic``:

    * ``rwens_<versions joined by "_">_oof.npy`` -- the out-of-fold predictions.
    * ``ensemble_<number of versions>_ablation.csv`` -- a one-row metrics table.

    Raises:
        KeyError: if any name passed via ``--versions`` has no matching
            configuration in ``rw.small_configs()``, ``rw.graph_configs()``
            or ``rw.extra_configs()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--package-root",
        type=Path,
        default=Path(__file__).resolve().parents[1],
    )
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--main-val-score-file", type=Path, required=True)
    parser.add_argument("--versions", nargs="+", required=True)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    args = parser.parse_args()

    root = args.package_root
    sys_dir = (
        root
        / "validation_runs"
        / f"dynamic_seed{args.split_seed}"
        / "randomwalk_systematic"
    )
    all_configs = rw.small_configs() + rw.graph_configs() + rw.extra_configs()
    cfgs = {c.version_name: c for c in all_configs}

    # Fail fast: reject unknown versions before any features are built or
    # models are loaded, instead of hitting a bare KeyError mid-loop.
    unknown = [version for version in args.versions if version not in cfgs]
    if unknown:
        raise KeyError(f"unknown version(s): {', '.join(unknown)}")

    train_refs, pairs, y, X_base = rw.build_base_features(
        root, args.split_seed, args.main_val_score_file
    )

    blocks = []
    for version in args.versions:
        model = Word2Vec.load(str(sys_dir / "models" / f"{version}.model"))
        # Only the first element of the returned pair is used here.
        block, _ = rw.pair_feature_block(
            model, pairs, cfgs[version], root, args.split_seed, train_refs
        )
        blocks.append(block)

    X = np.column_stack([X_base, *blocks, aggregate(blocks)]).astype(np.float32)
    print("fit_oof", X.shape)

    oof = rw.fit_lgb_oof(X, y, args.seed, args.n_splits)
    f1, th, auc, p, r = rw.best_f1(y, oof)

    version_name = "rwens_" + "_".join(args.versions)
    np.save(sys_dir / f"{version_name}_oof.npy", oof)

    row = {
        "version_name": version_name,
        "versions": ",".join(args.versions),
        "validation_F1": f1,
        "threshold": th,
        "auc": auc,
        "precision": p,
        "recall": r,
        "n_features": X.shape[1],
    }
    # NOTE(review): the file name depends only on the number of versions, so
    # two different ensembles of the same size overwrite each other's CSV --
    # confirm this is intended.
    path = sys_dir / f"ensemble_{len(args.versions)}_ablation.csv"
    summary = pd.DataFrame([row])  # built once, reused for the CSV and the console
    summary.to_csv(path, index=False)
    print(summary.to_string(index=False))
    print(path)
# Script entry point: run the ensemble validation only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|