File size: 5,207 Bytes
# src/models/ensemble.py
#
# Multi-seed OOF ensemble.
# 3 seeds × 3 model types × 5 folds = 45 GBM runs.
# Models: LGBM-RMSE, CatBoost, XGBoost.
# All trained on z-scored targets, predictions inverse-transformed.
import numpy as np
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import joblib
from pathlib import Path
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
class TargetScaler:
    """Z-score normalization on training targets only.

    ``fit`` stores the mean and standard deviation of the targets it is
    given; ``transform`` and ``inverse`` apply and undo the affine mapping
    ``(y - mu) / std``.
    """

    def fit(self, y):
        """Record mean and standard deviation of ``y`` and return ``self``.

        ``y`` must expose ``.mean()`` and ``.std()`` (e.g. a NumPy array).
        """
        self.mu = y.mean()
        std = y.std()
        # Constant targets have zero spread; dividing by it in transform()
        # would yield NaN/inf. Fall back to unit scale so the transform
        # degrades to plain centering and inverse() stays an exact round trip.
        if std == 0:
            std = 1.0
        self.std = std
        return self

    def transform(self, y):
        """Map raw targets to z-scores using the fitted statistics."""
        return (y - self.mu) / self.std

    def inverse(self, y):
        """Map z-scored values back to the raw target scale."""
        return y * self.std + self.mu
def _lgbm_rmse(seed, lr, n_trees):
    """Return an unfitted LightGBM regressor with this module's fixed settings.

    Only the random seed, learning rate and number of estimators vary per
    call; every other hyperparameter is pinned here.
    """
    params = {
        "objective": 'regression',
        "num_leaves": 63,
        "max_depth": 7,
        "learning_rate": lr,
        "n_estimators": n_trees,
        "min_child_samples": 25,
        "subsample": 0.75,
        "colsample_bytree": 0.75,
        "reg_alpha": 0.2,
        "reg_lambda": 2.0,
        "random_state": seed,
        "n_jobs": 4,
        "verbose": -1,
    }
    return lgb.LGBMRegressor(**params)
def _catboost(seed, lr, n_trees, early_stop):
    """Return an unfitted CatBoost regressor with this module's fixed settings.

    The early-stopping patience is passed to the constructor, so the fit
    call only needs to supply an evaluation set.
    """
    params = {
        "depth": 7,
        "learning_rate": lr,
        "iterations": n_trees,
        "l2_leaf_reg": 5.0,
        "subsample": 0.75,
        "min_data_in_leaf": 25,
        "loss_function": 'RMSE',
        "eval_metric": 'RMSE',
        "early_stopping_rounds": early_stop,
        "random_seed": seed,
        "verbose": 0,
        "task_type": 'CPU',
    }
    return cb.CatBoostRegressor(**params)
def _xgboost(seed, lr, n_trees, early_stop):
    """Return an unfitted XGBoost regressor with this module's fixed settings.

    The early-stopping patience and the evaluation metric are passed to the
    constructor, so the fit call only needs to supply an evaluation set.
    """
    params = {
        "n_estimators": n_trees,
        "max_depth": 6,
        "learning_rate": lr,
        "subsample": 0.75,
        "colsample_bytree": 0.75,
        "reg_alpha": 0.2,
        "reg_lambda": 2.0,
        "min_child_weight": 6,
        "early_stopping_rounds": early_stop,
        "eval_metric": 'rmse',
        "random_state": seed,
        "n_jobs": 4,
        "verbosity": 0,
    }
    return xgb.XGBRegressor(**params)
def _fit(model, Xtr, ytr, Xval, yval, early_stop):
    """Fit ``model`` on the training split, monitoring the validation split.

    The fit call differs per library, so dispatch is by model class:
    LightGBM receives early stopping as a callback (using ``early_stop``),
    CatBoost is asked to keep its best iteration, and any other model
    (XGBoost in this module) is fitted with a list-style ``eval_set``.

    Returns the same ``model`` object after fitting.
    """
    if isinstance(model, lgb.LGBMRegressor):
        stop_cb = lgb.early_stopping(early_stop, verbose=False)
        quiet_cb = lgb.log_evaluation(-1)
        model.fit(
            Xtr, ytr,
            eval_set=[(Xval, yval)],
            callbacks=[stop_cb, quiet_cb],
        )
        return model

    if isinstance(model, cb.CatBoostRegressor):
        model.fit(Xtr, ytr, eval_set=(Xval, yval), use_best_model=True)
        return model

    # Fallback branch: everything that is neither LightGBM nor CatBoost.
    model.fit(Xtr, ytr, eval_set=[(Xval, yval)], verbose=False)
    return model
def run_oof(X_train, y_train_raw, X_test,
            seeds, n_folds, lr, n_trees, early_stop,
            models_dir: Path = None) -> tuple:
    """
    Multi-seed OOF stacking.
    3 model types: LGBM, CatBoost, XGBoost.

    For every seed, each model type is trained on every fold of a shuffled
    K-fold split of the training data. Validation-fold predictions fill the
    out-of-fold (OOF) matrix; test predictions are averaged over folds.
    Models are trained on z-scored targets and all returned predictions are
    inverse-transformed back to the raw target scale.

    Args:
        X_train: training features; rows are selected with integer index
            arrays (``X_train[idx]``), so a NumPy array is expected.
        y_train_raw: raw (unscaled) training targets, indexed the same way
            after scaling.
        X_test: test features passed to each model's ``predict``.
        seeds: sized sequence of random seeds; one group of 3 output
            columns is produced per seed.
        n_folds: number of K-fold splits.
        lr: learning rate shared by all model types.
        n_trees: maximum number of boosting iterations per model.
        early_stop: early-stopping patience (rounds) for all model types.
        models_dir: if provided, saves each fold model as
            fold_model_s{seed}_{type}_f{fold}.pkl
            Required for CASF-2013 zero-shot evaluation.
            The directory (including missing parents) is created.

    Returns:
        oof_matrix  [N_train, n_seeds * 3]
        test_matrix [N_test, n_seeds * 3]
        scaler      fitted TargetScaler

        Columns within each seed group are ordered LGBM, CatBoost, XGBoost.
    """
    scaler = TargetScaler().fit(y_train_raw)
    y_train = scaler.transform(y_train_raw)
    # The split itself is fixed (random_state=42); only the model seeds vary.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Prepare the output directory once instead of on every fold, and create
    # missing parents so a nested path does not raise FileNotFoundError.
    if models_dir is not None:
        models_dir = Path(models_dir)
        models_dir.mkdir(parents=True, exist_ok=True)

    # (file tag, factory) per model type; the order fixes the column order
    # inside each seed group of the output matrices.
    model_specs = (
        ("lgbm", lambda s: _lgbm_rmse(s, lr, n_trees)),
        ("cb", lambda s: _catboost(s, lr, n_trees, early_stop)),
        ("xgb", lambda s: _xgboost(s, lr, n_trees, early_stop)),
    )
    n_types = len(model_specs)
    n_train, n_test = len(X_train), len(X_test)

    oof_mat = np.zeros((n_train, len(seeds) * n_types))
    test_mat = np.zeros((n_test, len(seeds) * n_types))

    for si, seed in enumerate(seeds):
        print(f"\n Seed {seed} ({si+1}/{len(seeds)})")
        # Per-seed buffers, still in z-score space.
        oof = np.zeros((n_train, n_types))
        test_folds = np.zeros((n_test, n_types, n_folds))

        for fold, (tri, vali) in enumerate(kf.split(X_train)):
            Xtr, Xval = X_train[tri], X_train[vali]
            ytr, yval = y_train[tri], y_train[vali]

            for mi, (tag, build) in enumerate(model_specs):
                # NOTE(review): the validation fold drives early stopping and
                # is also the fold scored for OOF, so OOF metrics may be
                # slightly optimistic.
                model = _fit(build(seed), Xtr, ytr, Xval, yval, early_stop)

                # Save fold models for zero-shot evaluation on new test sets
                if models_dir is not None:
                    joblib.dump(
                        model,
                        models_dir / f"fold_model_s{seed}_{tag}_f{fold}.pkl",
                    )

                oof[vali, mi] = model.predict(Xval)
                test_folds[:, mi, fold] = model.predict(X_test)

        cols = slice(si * n_types, (si + 1) * n_types)
        oof_mat[:, cols] = scaler.inverse(oof)
        # Average the fold models' test predictions, then undo the scaling.
        test_mat[:, cols] = scaler.inverse(test_folds.mean(axis=2))

        # Progress metric: correlation of this seed's model-averaged OOF
        # predictions with the raw targets.
        p = pearsonr(oof_mat[:, cols].mean(1), y_train_raw)[0]
        print(f" OOF Pearson (seed {seed}): {p:.4f}")

    return oof_mat, test_mat, scaler
|