"""CV-aware training driver — one harness for all five models. The driver expects each model to expose ``fit(X, y, sample_weight=None)``, ``predict(X)``, and (optionally) ``predict_proba(X)``. The triple-barrier label ``{-1, 0, +1}`` is shared across all of them. Sample weights come from AFML Ch.4 — observations whose label intervals overlap contribute less unique information, so they should count less in the loss. The simplest implementation is to weight inversely by the number of overlapping labels (Snippet 4.1); for now the driver supports passing pre-computed weights or falling back to uniform. """ from __future__ import annotations from collections.abc import Callable from typing import Any import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from .cv import PurgedKFold from .eval import fold_metrics def fit_predict_one_fold( model_builder: Callable[[], Any], X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, sample_weight_train: np.ndarray | None = None, standardize: bool = True, ) -> tuple[np.ndarray, Any]: """Fit on the train fold, predict on the test fold. Returns (y_pred, fitted_model).""" if standardize: scaler = StandardScaler().fit(X_train.values) X_train_s = pd.DataFrame( scaler.transform(X_train.values), index=X_train.index, columns=X_train.columns ) X_test_s = pd.DataFrame( scaler.transform(X_test.values), index=X_test.index, columns=X_test.columns ) else: X_train_s, X_test_s = X_train, X_test model = model_builder() model.fit(X_train_s, y_train.values, sample_weight=sample_weight_train) return model.predict(X_test_s), model def run_cv( model_name: str, model_builder: Callable[[], Any], X: pd.DataFrame, y: pd.Series, cv: PurgedKFold, sample_weight: pd.Series | None = None, standardize: bool = True, extra_columns: dict | None = None, ) -> pd.DataFrame: """Run a model across all CV folds. Returns one row per fold.""" rows = [] for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X)): X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] sw_train = sample_weight.iloc[train_idx].values if sample_weight is not None else None y_pred, _ = fit_predict_one_fold( model_builder=model_builder, X_train=X_train, y_train=y_train, X_test=X_test, sample_weight_train=sw_train, standardize=standardize, ) metrics = fold_metrics(y_test.values, y_pred) row = {"model": model_name, "fold": fold_idx, **metrics} if extra_columns: row.update(extra_columns) rows.append(row) return pd.DataFrame(rows) def uniqueness_weights(t1: pd.Series) -> pd.Series: """Approximate AFML Ch.4 sample-uniqueness weights. For each event, count how many other events have overlapping ``[start, t1]`` intervals, and weight inversely. Not the rigorous Snippet 4.1 (which counts overlap proportionally), but the right order of magnitude and much faster. """ weights = pd.Series(1.0, index=t1.index) t1_arr = t1.values start_arr = t1.index.values n = len(t1) for i in range(n): overlap = np.sum((start_arr <= t1_arr[i]) & (t1_arr >= start_arr[i])) weights.iloc[i] = 1.0 / max(overlap, 1) # normalize so the weights sum to n (mean weight = 1) weights *= n / weights.sum() return weights