Spaces:

moccaram
/

DataSynthis_ML_JobTask

Sleeping

File size: 1,883 Bytes

8ba081b

"""XGBoost classifier for triple-barrier labels.

Per Jansen Ch.12, gradient-boosted trees are the natural baseline for tabular
financial features and routinely beat LSTMs on these problems. The hyper-
parameters here are conservative (shallow trees, moderate n_estimators) to
avoid overfitting on small per-fold training sets in the purged CV scheme.
"""

from __future__ import annotations

import numpy as np
from xgboost import XGBClassifier


def build_xgb_classifier(random_state: int = 42) -> XGBClassifier:
    """Returns a fresh XGBClassifier for one CV fold.

    Output classes use the XGBoost-internal indexing ``{0, 1, 2}`` for
    ``{-1, 0, +1}`` since XGBoost requires non-negative integer labels. The
    training driver wraps this with an encoder.
    """
    return XGBClassifier(
        objective="multi:softprob",
        num_class=3,
        max_depth=4,
        n_estimators=300,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        eval_metric="mlogloss",
        random_state=random_state,
        n_jobs=-1,
        tree_method="hist",
    )


class XGBTripleBarrier:
    """Thin wrapper that owns the label encoding from ``{-1, 0, 1}`` ↔ ``{0, 1, 2}``."""

    def __init__(self, random_state: int = 42):
        self.model = build_xgb_classifier(random_state=random_state)
        self.classes_ = np.array([-1, 0, 1])

    def fit(self, X, y, sample_weight=None):
        y_enc = np.asarray(y).astype(int) + 1  # {-1, 0, 1} -> {0, 1, 2}
        self.model.fit(X, y_enc, sample_weight=sample_weight)
        return self

    def predict(self, X):
        y_pred_enc = self.model.predict(X)
        return y_pred_enc - 1

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    @property
    def feature_importances_(self):
        return self.model.feature_importances_