File size: 5,207 Bytes
0f1a3b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# src/models/ensemble.py
#
# Multi-seed OOF ensemble.
# 3 seeds × 3 model types × 5 folds = 45 GBM runs.
# Models: LGBM-RMSE, CatBoost, XGBoost.
# All trained on z-scored targets, predictions inverse-transformed.

import numpy as np
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import joblib
from pathlib import Path
from scipy.stats import pearsonr
from sklearn.model_selection import KFold


class TargetScaler:
    """Z-score normalization on training targets only.

    ``fit`` records the mean and standard deviation of the targets it is
    given (via their own ``.mean()`` / ``.std()`` methods); ``transform``
    maps targets to z-scores and ``inverse`` maps z-scores back.

    If the fitted standard deviation is exactly zero (constant targets),
    ``transform`` divides by 1.0 instead, so it returns zeros rather than
    NaN/inf. ``inverse`` still multiplies by the stored ``std``, so in that
    case every value maps back to the fitted mean.
    """

    def fit(self, y):
        """Store the mean (``mu``) and standard deviation (``std``) of ``y``; return self."""
        self.mu = y.mean()
        self.std = y.std()
        return self

    def _scale(self):
        """Return the divisor used by ``transform`` (``std``, or 1.0 where ``std`` is 0)."""
        # Only an exact zero is replaced; a NaN std is passed through unchanged.
        return np.where(np.asarray(self.std) == 0, 1.0, self.std)

    def transform(self, y):
        """Return ``(y - mu) / std``, guarding against a zero ``std``."""
        return (y - self.mu) / self._scale()

    def inverse(self, y):
        """Return ``y * std + mu`` (undoes ``transform``)."""
        return y * self.std + self.mu


def _lgbm_rmse(seed, lr, n_trees):
    """Build an unfitted LightGBM regressor with the 'regression' objective.

    Args:
        seed:    value passed as ``random_state``.
        lr:      value passed as ``learning_rate``.
        n_trees: value passed as ``n_estimators`` (upper bound on boosting
                 rounds).

    Returns:
        A configured, unfitted ``lgb.LGBMRegressor``.
    """
    return lgb.LGBMRegressor(
        objective='regression', num_leaves=63, max_depth=7,
        learning_rate=lr, n_estimators=n_trees,
        min_child_samples=25,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; the wrapper's default subsample_freq=0 would leave
        # subsample=0.75 without effect.
        subsample=0.75, subsample_freq=1, colsample_bytree=0.75,
        reg_alpha=0.2, reg_lambda=2.0,
        random_state=seed, n_jobs=4, verbose=-1,
    )

def _catboost(seed, lr, n_trees, early_stop):
    """Build an unfitted CatBoost regressor trained and evaluated on RMSE (CPU).

    Args:
        seed:       value passed as ``random_seed``.
        lr:         value passed as ``learning_rate``.
        n_trees:    value passed as ``iterations``.
        early_stop: value passed as ``early_stopping_rounds``.

    Returns:
        A configured, unfitted ``cb.CatBoostRegressor``.
    """
    # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise and
    # Lossguide grow policies; no grow policy is set here -- confirm it has
    # the intended effect.
    settings = dict(
        iterations=n_trees,
        learning_rate=lr,
        depth=7,
        l2_leaf_reg=5.0,
        subsample=0.75,
        min_data_in_leaf=25,
        loss_function='RMSE',
        eval_metric='RMSE',
        early_stopping_rounds=early_stop,
        random_seed=seed,
        task_type='CPU',
        verbose=0,
    )
    return cb.CatBoostRegressor(**settings)

def _xgboost(seed, lr, n_trees, early_stop):
    """Build an unfitted XGBoost regressor evaluated on RMSE.

    Args:
        seed:       value passed as ``random_state``.
        lr:         value passed as ``learning_rate``.
        n_trees:    value passed as ``n_estimators``.
        early_stop: value passed as ``early_stopping_rounds`` (set on the
                    estimator itself, not at fit time).

    Returns:
        A configured, unfitted ``xgb.XGBRegressor``.
    """
    settings = dict(
        n_estimators=n_trees,
        learning_rate=lr,
        max_depth=6,
        min_child_weight=6,
        subsample=0.75,
        colsample_bytree=0.75,
        reg_alpha=0.2,
        reg_lambda=2.0,
        early_stopping_rounds=early_stop,
        eval_metric='rmse',
        random_state=seed,
        n_jobs=4,
        verbosity=0,
    )
    return xgb.XGBRegressor(**settings)


def _fit(model, Xtr, ytr, Xval, yval, early_stop):
    """Fit ``model`` on the training split, using the validation split as eval set.

    The call to ``fit`` depends on the model's type:

    * ``lgb.LGBMRegressor``: early stopping and silenced logging are supplied
      as callbacks; ``early_stop`` is the stopping patience.
    * ``cb.CatBoostRegressor``: fitted with ``use_best_model=True``.
    * anything else: fitted with a list-style ``eval_set`` and
      ``verbose=False`` (in this file that is the XGBoost regressor).

    ``early_stop`` is only read in the LightGBM branch.

    Returns:
        The same ``model`` object, after fitting.
    """
    if isinstance(model, lgb.LGBMRegressor):
        stopper = lgb.early_stopping(early_stop, verbose=False)
        quiet = lgb.log_evaluation(-1)
        model.fit(Xtr, ytr, eval_set=[(Xval, yval)], callbacks=[stopper, quiet])
        return model

    if isinstance(model, cb.CatBoostRegressor):
        model.fit(Xtr, ytr, eval_set=(Xval, yval), use_best_model=True)
        return model

    model.fit(Xtr, ytr, eval_set=[(Xval, yval)], verbose=False)
    return model


def run_oof(X_train, y_train_raw, X_test,
            seeds, n_folds, lr, n_trees, early_stop,
            models_dir: Path = None) -> tuple:
    """
    Multi-seed OOF stacking.
    3 model types: LGBM, CatBoost, XGBoost.

    Targets are z-scored with a TargetScaler fitted on ``y_train_raw``; all
    models are trained on the scaled targets and every prediction written to
    the returned matrices is mapped back to the raw scale. The K-fold split
    uses a fixed ``random_state=42``, so every seed sees the same folds; the
    seed only varies the models.

    Args:
        X_train:     training features; indexed positionally with the fold
                     index arrays (``X_train[idx]``).
        y_train_raw: training targets on the raw scale.
        X_test:      test features, predicted by every fold model.
        seeds:       iterable of model seeds; each contributes 3 columns.
        n_folds:     number of K-fold splits.
        lr:          learning rate shared by all three model types.
        n_trees:     maximum number of boosting rounds per model.
        early_stop:  early-stopping patience shared by all three model types.
        models_dir: if provided, saves each fold model as
                    fold_model_s{seed}_{type}_f{fold}.pkl
                    Required for CASF-2013 zero-shot evaluation.
                    The directory (including missing parents) is created
                    before any training starts.

    Returns:
        oof_matrix   [N_train, n_seeds * 3]
        test_matrix  [N_test,  n_seeds * 3]
        scaler       fitted TargetScaler

        Per seed, columns are ordered LGBM, CatBoost, XGBoost. Test
        predictions are the mean over the fold models.
    """
    scaler = TargetScaler().fit(y_train_raw)
    y_train = scaler.transform(y_train_raw)
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Create the output directory once, up front, so a bad path fails before
    # any model is trained and nested paths work.
    if models_dir is not None:
        models_dir = Path(models_dir)
        models_dir.mkdir(parents=True, exist_ok=True)

    # (file-name tag, factory) per model type; the order fixes the column order.
    builders = (
        ("lgbm", lambda s: _lgbm_rmse(s, lr, n_trees)),
        ("cb",   lambda s: _catboost(s, lr, n_trees, early_stop)),
        ("xgb",  lambda s: _xgboost(s, lr, n_trees, early_stop)),
    )
    n_models = len(builders)
    n_train, n_test = len(X_train), len(X_test)

    oof_mat = np.zeros((n_train, len(seeds) * n_models))
    test_mat = np.zeros((n_test, len(seeds) * n_models))

    for si, seed in enumerate(seeds):
        print(f"\n  Seed {seed}  ({si+1}/{len(seeds)})")

        # Predictions on the z-scored scale for this seed.
        oof_z = np.zeros((n_train, n_models))
        test_z = np.zeros((n_test, n_models, n_folds))

        for fold, (tri, vali) in enumerate(kf.split(X_train)):
            Xtr, Xval = X_train[tri], X_train[vali]
            ytr, yval = y_train[tri], y_train[vali]

            for mi, (tag, build) in enumerate(builders):
                model = _fit(build(seed), Xtr, ytr, Xval, yval, early_stop)

                # Save fold models for zero-shot evaluation on new test sets
                if models_dir is not None:
                    joblib.dump(
                        model,
                        models_dir / f"fold_model_s{seed}_{tag}_f{fold}.pkl",
                    )

                oof_z[vali, mi] = model.predict(Xval)
                test_z[:, mi, fold] = model.predict(X_test)

        base = si * n_models
        cols = slice(base, base + n_models)
        oof_mat[:, cols] = scaler.inverse(oof_z)
        # Average the fold models' test predictions, then undo the scaling.
        test_mat[:, cols] = scaler.inverse(test_z.mean(axis=2))

        p = pearsonr(oof_mat[:, cols].mean(1), y_train_raw)[0]
        print(f"    OOF Pearson (seed {seed}): {p:.4f}")

    return oof_mat, test_mat, scaler