"""Compare second-stage learners on the best high-order feature matrix.""" from __future__ import annotations import argparse import importlib.util import sys from pathlib import Path import lightgbm as lgb import numpy as np import pandas as pd import xgboost as xgb from gensim.models import Word2Vec from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier from sklearn.metrics import precision_recall_curve, roc_auc_score from sklearn.model_selection import StratifiedKFold def load_module(name: str, path: Path): spec = importlib.util.spec_from_file_location(name, path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None sys.modules[name] = module spec.loader.exec_module(module) return module def best_f1(y, s): p, r, t = precision_recall_curve(y, s) f = 2 * p * r / (p + r + 1e-12) i = int(np.argmax(f)) return float(f[i]), float(t[i] if i < len(t) else 0.5), float(roc_auc_score(y, s)), float(p[i]), float(r[i]) def fit_oof(X, y, kind: str, seed: int, n_splits: int): oof = np.zeros(len(y), dtype=np.float32) skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) for fold, (tr, va) in enumerate(skf.split(X, y), start=1): if kind == "xgb_depth3": clf = xgb.XGBClassifier( n_estimators=900, learning_rate=0.025, max_depth=3, min_child_weight=20, subsample=0.9, colsample_bytree=0.85, reg_lambda=8.0, objective="binary:logistic", eval_metric="logloss", tree_method="hist", n_jobs=8, random_state=seed + fold, ) elif kind == "xgb_depth4": clf = xgb.XGBClassifier( n_estimators=800, learning_rate=0.025, max_depth=4, min_child_weight=30, subsample=0.9, colsample_bytree=0.85, reg_lambda=10.0, objective="binary:logistic", eval_metric="logloss", tree_method="hist", n_jobs=8, random_state=seed + fold, ) elif kind == "extratrees": clf = ExtraTreesClassifier( n_estimators=500, max_depth=24, min_samples_leaf=10, max_features=0.55, n_jobs=8, random_state=seed + fold, ) else: clf = lgb.LGBMClassifier( n_estimators=1300, learning_rate=0.022, num_leaves=15, subsample=0.9, colsample_bytree=0.9, reg_lambda=8.0, min_child_samples=100, objective="binary", n_jobs=8, verbose=-1, random_state=seed + fold, ) clf.fit(X[tr], y[tr]) oof[va] = clf.predict_proba(X[va])[:, 1].astype(np.float32) return oof def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1]) ap.add_argument("--split-seed", type=int, default=202) ap.add_argument("--seed", type=int, default=202) ap.add_argument("--n-splits", type=int, default=5) args = ap.parse_args() root = args.package_root.resolve() rw = load_module("rw", root / "code/randomwalk_systematic_ablation.py") stack = load_module("stack", root / "code/stack_rank_calibration.py") rich = load_module("rich", root / "code/content_rich_ablation.py") ens = load_module("ens", root / "code/generate_randomwalk_ensemble_submission.py") high = load_module("high", root / "code/high_order_graph_stack.py") main_val = root / "validation_runs/dynamic_seed202/dyn202_l2d512_bpr_bigbatch_more/scores/val_vanilla_ensemble_mean.npy" train_refs, pairs, y, X_base = rw.build_base_features(root, args.split_seed, main_val) builder = stack.ExplicitGraphFeatures(root, train_refs) X_rich = rich.content_rich_features(root, pairs, builder) versions = [ "dw_base_d128_l40_w10_win10", "dw_long_d128_l80_w10_win10", "dw_highdim_d256_l40_w10_win10", "dw_d256_l80_w10_win10", "dw_seed3407_d128_l40_w10_win10", "dw_graph_ap_pp", "n2v_p2_q1_d128_l40_w10_win10", ] cfgs = {c.version_name: c for c in rw.small_configs() + rw.graph_configs() + rw.extra_configs()} sys_dir = root / "validation_runs/dynamic_seed202/randomwalk_systematic" blocks = [] for version in versions: cfg = cfgs[version] model = Word2Vec.load(str(sys_dir / "models" / f"{version}.model")) block, _ = rw.pair_feature_block(model, pairs, cfg, root, args.split_seed, train_refs) blocks.append(block) X_high = high.build_high_order(root, train_refs, pairs, "val202") X_high_dir = high.build_high_order_directed(root, train_refs, pairs, "val202") X = np.column_stack([X_base, X_rich, *blocks, ens.aggregate(blocks), X_high, X_high_dir]).astype(np.float32) print("X", X.shape) out = root / "validation_runs/dynamic_seed202/high_order_model_compare" out.mkdir(parents=True, exist_ok=True) rows = [] oofs = {} for kind in ["lgb15", "xgb_depth3", "xgb_depth4", "extratrees"]: print("fit", kind) oof = fit_oof(X, y, kind, args.seed + len(rows) * 41, args.n_splits) np.save(out / f"{kind}_oof.npy", oof) oofs[kind] = oof f1, th, auc, p, r = best_f1(y, oof) rows.append({"stage": kind, "validation_f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r}) print(rows[-1]) keys = list(oofs) for a in np.linspace(0, 1, 11): if "xgb_depth3" not in oofs: continue s = a * oofs["lgb15"] + (1 - a) * oofs["xgb_depth3"] f1, th, auc, p, r = best_f1(y, s) rows.append({"stage": f"blend_lgb_xgb3_a{a:.1f}", "validation_f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r}) if len(keys) >= 3: s = np.mean([oofs[k] for k in keys if k != "extratrees"], axis=0) f1, th, auc, p, r = best_f1(y, s) rows.append({"stage": "mean_lgb_xgb3_xgb4", "validation_f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r}) pd.DataFrame(rows).sort_values("validation_f1", ascending=False).to_csv(out / "summary.csv", index=False) print(pd.DataFrame(rows).sort_values("validation_f1", ascending=False).to_string(index=False)) if __name__ == "__main__": main()