"""
Stratified K-fold cross-validation for the stable production pipeline.
"""

from __future__ import annotations

from typing import Any, Callable

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

from src.utils.logger import get_logger

logger = get_logger(__name__)


def stratified_kfold_cv(
    X: pd.Series,
    y: pd.Series,
    *,
    n_splits: int,
    random_state: int,
    fit_predict_fn: Callable[[pd.Series, pd.Series, pd.Series, pd.Series], dict[str, float]],
    f1_std_threshold: float = 0.05,
    gap_max_threshold: float = 0.05,
) -> dict[str, Any]:
    """
    Run stratified K-fold CV with a caller-provided fit/eval hook.

    ``fit_predict_fn`` receives (X_tr, y_tr, X_val, y_val) and returns per-fold
    metrics including at least ``f1_weighted``. ``train_val_gap`` and
    ``roc_auc`` are optional: a missing gap counts as 0 and a missing ROC AUC
    counts as NaN (ignored by the mean).

    Args:
        X: Samples; sliced positionally with ``.iloc`` for every fold.
        y: Labels aligned with ``X``; used for stratification.
        n_splits: Number of folds passed to ``StratifiedKFold``.
        random_state: Seed for the shuffled split.
        fit_predict_fn: Per-fold hook described above.
        f1_std_threshold: Upper bound (exclusive) on the rounded F1 standard
            deviation for the run to be flagged stable.
        gap_max_threshold: Upper bound (exclusive) on the rounded maximum
            train/val gap for the run to be flagged stable.

    Returns:
        A summary dict with ``n_splits``, F1 mean/std/min/max, gap
        mean/std/max, ``roc_auc_mean`` (all rounded to 4 decimals), the list
        of per-fold metric dicts under ``folds`` (each extended with a
        zero-based ``fold`` index) and the boolean ``stable_across_folds``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_metrics: list[dict[str, float]] = []

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
        # Reset the index so the hook always sees a clean 0..n-1 index.
        X_tr = X.iloc[tr_idx].reset_index(drop=True)
        y_tr = y.iloc[tr_idx].reset_index(drop=True)
        X_val = X.iloc[val_idx].reset_index(drop=True)
        y_val = y.iloc[val_idx].reset_index(drop=True)

        # Copy before tagging: the hook's dict belongs to the caller and may
        # be reused between calls, which would alias every fold entry.
        m = dict(fit_predict_fn(X_tr, y_tr, X_val, y_val))
        m["fold"] = fold
        fold_metrics.append(m)
        logger.info(
            f"CV fold {fold + 1}/{n_splits} — F1={m['f1_weighted']:.4f} "
            f"gap={m.get('train_val_gap', 0):.4f}"
        )

    f1s = [m["f1_weighted"] for m in fold_metrics]
    gaps = [m.get("train_val_gap", 0.0) for m in fold_metrics]
    rocs = np.asarray([m.get("roc_auc", np.nan) for m in fold_metrics], dtype=float)

    # np.nanmean warns ("Mean of empty slice") when every entry is NaN, which
    # is a legitimate case here: the hook is not required to report roc_auc.
    if np.isnan(rocs).all():
        roc_auc_mean = float("nan")
    else:
        roc_auc_mean = float(np.nanmean(rocs))

    summary = {
        "n_splits": n_splits,
        "f1_mean": round(float(np.mean(f1s)), 4),
        "f1_std": round(float(np.std(f1s)), 4),
        "f1_min": round(float(np.min(f1s)), 4),
        "f1_max": round(float(np.max(f1s)), 4),
        "gap_mean": round(float(np.mean(gaps)), 4),
        "gap_std": round(float(np.std(gaps)), 4),
        "gap_max": round(float(np.max(gaps)), 4),
        "roc_auc_mean": round(roc_auc_mean, 4),
        "folds": fold_metrics,
    }
    # Stability is judged on the rounded values that are actually reported.
    stable = (
        summary["f1_std"] < f1_std_threshold
        and summary["gap_max"] < gap_max_threshold
    )
    summary["stable_across_folds"] = stable
    return summary


def evaluate_lr_fold(
    lr_model_factory: Callable[[], Any],
    X_tr: pd.Series,
    y_tr: pd.Series,
    X_val: pd.Series,
    y_val: pd.Series,
    *,
    augment_fn: Callable[..., Any] | None = None,
    cfg: dict | None = None,
    seed: int = 42,
) -> dict[str, float]:
    """Fit LR on (optionally augmented) fold train; score fold val.

    Args:
        lr_model_factory: Zero-argument callable returning a fresh estimator
            exposing ``fit``, ``predict`` and ``predict_proba``.
        X_tr: Fold training samples.
        y_tr: Fold training labels.
        X_val: Fold validation samples.
        y_val: Fold validation labels; cast to ``int`` before scoring.
        augment_fn: Optional hook called as
            ``augment_fn(X_tr, y_tr, cfg, seed=seed)`` and expected to return
            the ``(X, y)`` pair to fit on. Only applied when ``cfg`` is also
            given.
        cfg: Configuration forwarded to ``augment_fn``.
        seed: Seed forwarded to ``augment_fn``.

    Returns:
        A dict with ``f1_weighted`` (validation), ``f1_train`` (measured on
        the data the model was fitted on, i.e. after augmentation),
        ``train_val_gap`` (absolute difference, 4 decimals),
        ``train_val_gap_pp`` (same gap in percentage points, 2 decimals) and
        ``roc_auc`` (4 decimals, or NaN when the validation labels contain
        fewer than two classes).
    """
    if augment_fn and cfg is not None:
        X_fit, y_fit = augment_fn(X_tr, y_tr, cfg, seed=seed)
    else:
        X_fit, y_fit = X_tr, y_tr

    model = lr_model_factory()
    model.fit(X_fit, y_fit)

    y_val_arr = y_val.astype(int).values
    preds_val = model.predict(X_val)
    preds_train = model.predict(X_fit)
    # Column 1 is the score of the second class in ``model.classes_``.
    probs_val = model.predict_proba(X_val)[:, 1]

    f1_val = float(f1_score(y_val_arr, preds_val, average="weighted", zero_division=0))
    f1_train = float(
        f1_score(y_fit.astype(int), preds_train, average="weighted", zero_division=0)
    )
    gap = abs(f1_train - f1_val)

    # ROC AUC is undefined when only one class is present in the validation
    # labels; report NaN for that fold (the CV summary averages with nanmean)
    # rather than letting a single degenerate fold abort the whole run.
    if np.unique(y_val_arr).size < 2:
        roc_auc = float("nan")
    else:
        roc_auc = round(float(roc_auc_score(y_val_arr, probs_val)), 4)

    return {
        "f1_weighted": f1_val,
        "f1_train": f1_train,
        "train_val_gap": round(gap, 4),
        "train_val_gap_pp": round(gap * 100, 2),
        "roc_auc": roc_auc,
    }