# cs3319-project2 / code / high_order_model_compare.py
# NLP-beginner's picture
# CS3319 Project 2 final deliverable (public F1 = 0.96626)
# f28d994
# Raw
# History Blame Contribute Delete
# 6.71 kB
"""Compare second-stage learners on the best high-order feature matrix."""
from __future__ import annotations
import argparse
import importlib.util
import sys
from pathlib import Path
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from gensim.models import Word2Vec
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
def load_module(name: str, path: Path):
    """Import the Python source file at *path* and register it as *name*.

    Args:
        name: Key under which the module is stored in ``sys.modules``.
        path: Location of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for *path*
            (for example, the file suffix is not a recognised source suffix).
        Exception: Whatever the module body itself raises; in that case the
            partially initialised module is removed from ``sys.modules``.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # check explicitly instead of relying on an assert (stripped under -O).
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}", name=name)
    module = importlib.util.module_from_spec(spec)
    # Register before executing so code run during the import can already
    # look the module up by name.
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(name, None)
        raise
    return module
def best_f1(y, s):
    """Pick the F1-maximising point on the precision-recall curve of *s*.

    Args:
        y: Ground-truth binary labels.
        s: Scores for the positive class, aligned with *y*.

    Returns:
        A 5-tuple of floats: ``(best_f1, threshold, roc_auc, precision,
        recall)``, where precision and recall are taken at the best-F1 point.
    """
    precision, recall, thresholds = precision_recall_curve(y, s)
    # The small epsilon keeps the denominator non-zero when both precision
    # and recall are zero at some point of the curve.
    f1_curve = 2 * precision * recall / (precision + recall + 1e-12)
    best = int(np.argmax(f1_curve))
    # The best index can fall past the end of the threshold array; fall back
    # to 0.5 in that case.
    if best < len(thresholds):
        threshold = thresholds[best]
    else:
        threshold = 0.5
    auc = roc_auc_score(y, s)
    return (
        float(f1_curve[best]),
        float(threshold),
        float(auc),
        float(precision[best]),
        float(recall[best]),
    )
def _make_second_stage_learner(kind: str, fold_seed: int):
    """Return an unfitted classifier for *kind*, seeded with *fold_seed*.

    Recognised kinds are ``"xgb_depth3"``, ``"xgb_depth4"`` and
    ``"extratrees"``; every other value yields the LightGBM configuration.
    """
    if kind == "extratrees":
        return ExtraTreesClassifier(
            n_estimators=500,
            max_depth=24,
            min_samples_leaf=10,
            max_features=0.55,
            n_jobs=8,
            random_state=fold_seed,
        )

    if kind == "xgb_depth3" or kind == "xgb_depth4":
        # Settings common to both XGBoost variants.
        shared = dict(
            learning_rate=0.025,
            subsample=0.9,
            colsample_bytree=0.85,
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            n_jobs=8,
            random_state=fold_seed,
        )
        if kind == "xgb_depth3":
            variant = dict(n_estimators=900, max_depth=3, min_child_weight=20, reg_lambda=8.0)
        else:
            variant = dict(n_estimators=800, max_depth=4, min_child_weight=30, reg_lambda=10.0)
        return xgb.XGBClassifier(**shared, **variant)

    # Default learner for any other kind (the caller passes "lgb15").
    return lgb.LGBMClassifier(
        n_estimators=1300,
        learning_rate=0.022,
        num_leaves=15,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=8.0,
        min_child_samples=100,
        objective="binary",
        n_jobs=8,
        verbose=-1,
        random_state=fold_seed,
    )


def fit_oof(X, y, kind: str, seed: int, n_splits: int):
    """Produce out-of-fold positive-class probabilities for one learner kind.

    Args:
        X: Feature matrix, indexable by arrays of row indices.
        y: Binary labels aligned with the rows of *X*.
        kind: Learner selector; see ``_make_second_stage_learner``.
        seed: Seed for the stratified split; each fold's model uses
            ``seed + fold`` with folds numbered from 1.
        n_splits: Number of stratified folds.

    Returns:
        A ``float32`` array of length ``len(y)`` where each entry is the
        prediction from the model that did not train on that row.
    """
    predictions = np.zeros(len(y), dtype=np.float32)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X, y), 1):
        learner = _make_second_stage_learner(kind, seed + fold_no)
        learner.fit(X[train_idx], y[train_idx])
        fold_proba = learner.predict_proba(X[valid_idx])[:, 1]
        predictions[valid_idx] = fold_proba.astype(np.float32)
    return predictions
def main() -> None:
    """Fit several second-stage learners on one feature matrix and compare them.

    Steps, in order:
      1. Parse ``--package-root``, ``--split-seed``, ``--seed`` and
         ``--n-splits``.
      2. Load the sibling pipeline scripts from ``<root>/code`` as modules.
      3. Column-stack the base, content-rich, random-walk (per version and
         aggregated) and high-order feature blocks into one float32 matrix.
      4. Fit each learner kind out-of-fold, save its predictions as
         ``<kind>_oof.npy`` and record its best-F1 metrics.
      5. Score LightGBM/XGBoost-depth-3 blends and the mean of all
         non-ExtraTrees learners.
      6. Write the metrics, sorted by validation F1, to ``summary.csv`` and
         print the same table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    parser.add_argument("--split-seed", type=int, default=202)
    parser.add_argument("--seed", type=int, default=202)
    parser.add_argument("--n-splits", type=int, default=5)
    args = parser.parse_args()
    root = args.package_root.resolve()

    # Sibling scripts are loaded by file path rather than imported as a package.
    rw = load_module("rw", root / "code/randomwalk_systematic_ablation.py")
    stack = load_module("stack", root / "code/stack_rank_calibration.py")
    rich = load_module("rich", root / "code/content_rich_ablation.py")
    ens = load_module("ens", root / "code/generate_randomwalk_ensemble_submission.py")
    high = load_module("high", root / "code/high_order_graph_stack.py")

    main_val = root / "validation_runs/dynamic_seed202/dyn202_l2d512_bpr_bigbatch_more/scores/val_vanilla_ensemble_mean.npy"
    train_refs, pairs, y, X_base = rw.build_base_features(root, args.split_seed, main_val)
    builder = stack.ExplicitGraphFeatures(root, train_refs)
    X_rich = rich.content_rich_features(root, pairs, builder)

    versions = [
        "dw_base_d128_l40_w10_win10",
        "dw_long_d128_l80_w10_win10",
        "dw_highdim_d256_l40_w10_win10",
        "dw_d256_l80_w10_win10",
        "dw_seed3407_d128_l40_w10_win10",
        "dw_graph_ap_pp",
        "n2v_p2_q1_d128_l40_w10_win10",
    ]
    all_configs = rw.small_configs() + rw.graph_configs() + rw.extra_configs()
    cfgs = {cfg.version_name: cfg for cfg in all_configs}
    sys_dir = root / "validation_runs/dynamic_seed202/randomwalk_systematic"

    # One pair-feature block per saved random-walk embedding model.
    blocks = []
    for version in versions:
        cfg = cfgs[version]
        model_path = sys_dir / "models" / f"{version}.model"
        model = Word2Vec.load(str(model_path))
        block, _ = rw.pair_feature_block(model, pairs, cfg, root, args.split_seed, train_refs)
        blocks.append(block)

    X_high = high.build_high_order(root, train_refs, pairs, "val202")
    X_high_dir = high.build_high_order_directed(root, train_refs, pairs, "val202")
    feature_parts = [X_base, X_rich, *blocks, ens.aggregate(blocks), X_high, X_high_dir]
    X = np.column_stack(feature_parts).astype(np.float32)
    print("X", X.shape)

    out = root / "validation_runs/dynamic_seed202/high_order_model_compare"
    out.mkdir(parents=True, exist_ok=True)

    def metrics_row(stage, scores):
        # Evaluate one score vector and package the metrics as a summary row.
        f1, th, auc, p, r = best_f1(y, scores)
        return {"stage": stage, "validation_f1": f1, "threshold": th, "auc": auc, "precision": p, "recall": r}

    rows = []
    oofs = {}
    for position, kind in enumerate(["lgb15", "xgb_depth3", "xgb_depth4", "extratrees"]):
        print("fit", kind)
        # Each learner gets its own split seed, spaced 41 apart.
        oof = fit_oof(X, y, kind, args.seed + position * 41, args.n_splits)
        np.save(out / f"{kind}_oof.npy", oof)
        oofs[kind] = oof
        rows.append(metrics_row(kind, oof))
        print(rows[-1])

    # Convex blends of LightGBM and XGBoost depth-3, weight a on LightGBM.
    if "xgb_depth3" in oofs:
        for a in np.linspace(0, 1, 11):
            blended = a * oofs["lgb15"] + (1 - a) * oofs["xgb_depth3"]
            rows.append(metrics_row(f"blend_lgb_xgb3_a{a:.1f}", blended))

    # Plain average of every learner except ExtraTrees.
    if len(oofs) >= 3:
        boosted = [scores for name, scores in oofs.items() if name != "extratrees"]
        rows.append(metrics_row("mean_lgb_xgb3_xgb4", np.mean(boosted, axis=0)))

    summary = pd.DataFrame(rows).sort_values("validation_f1", ascending=False)
    summary.to_csv(out / "summary.csv", index=False)
    print(summary.to_string(index=False))
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()