File size: 3,540 Bytes
e057d08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Cross-Validation
================

10-fold stratified cross-validation for model evaluation.

Author: UW MSIM Team
Date: November 2025
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from typing import List, Dict
import logging

from .metrics import calculate_classification_metrics, calculate_regression_metrics

logger = logging.getLogger(__name__)


def _encode_categorical_columns(X_train, X_val):
    """
    Integer-encode object/categorical columns without using validation data.

    For every column of X_train whose dtype is object or category, the
    category-to-code mapping is learned from X_train only and then applied
    to both frames. Values are compared by their string form
    (``astype(str)``); the sorted unique training labels receive the codes
    0..k-1. Categories that occur in X_val but not in X_train are mapped
    to -1.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. Determines which columns are encoded and which
        code each category receives.
    X_val : pd.DataFrame
        Validation features; must contain the columns encoded in X_train.

    Returns
    -------
    X_train, X_val : tuple of pd.DataFrame
        Copies of the inputs with the categorical columns replaced by
        int64 codes. The frames passed in are not modified.
    """
    # Work on copies so the caller's frames (often slices of a larger
    # frame) are never mutated.
    X_train = X_train.copy()
    X_val = X_val.copy()

    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) == 0:
        return X_train, X_val

    logger.info(f"  Encoding {len(cat_cols)} categorical columns: {list(cat_cols[:5])}{'...' if len(cat_cols) > 5 else ''}")

    for col in cat_cols:
        train_labels = X_train[col].astype(str)
        val_labels = X_val[col].astype(str)

        # The vocabulary comes from the training fold only, so the
        # training codes do not depend on what is in the validation fold.
        classes = sorted(train_labels.unique())

        # pd.Categorical assigns code -1 to any label outside `classes`,
        # which is exactly the "unknown category" marker we want for X_val.
        X_train[col] = pd.Categorical(train_labels, categories=classes).codes.astype('int64')
        X_val[col] = pd.Categorical(val_labels, categories=classes).codes.astype('int64')

    return X_train, X_val


def run_cross_validation(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    task_type: str = 'classification',
    n_folds: int = 10,
    random_state: int = 42
) -> List[Dict]:
    """
    Run shuffled k-fold cross-validation and collect per-fold metrics.

    For each fold the data is split, object/categorical feature columns
    are integer-encoded, the same ``model`` object is refit on the
    training part and scored on the validation part.

    Parameters
    ----------
    model : BaseModelWrapper
        Model to evaluate. Must provide ``fit(X, y)`` and ``predict(X)``
        and expose ``fit_time`` and ``predict_time`` attributes after
        those calls. ``predict_proba(X)`` is used for classification when
        it works, and skipped otherwise.
    X : pd.DataFrame
        Features
    y : pd.Series
        Target
    task_type : str
        'classification' selects StratifiedKFold and classification
        metrics; any other value selects KFold and regression metrics.
    n_folds : int
        Number of folds
    random_state : int
        Random seed for the shuffled splitter

    Returns
    -------
    fold_results : list of dict
        One metrics dict per fold, extended with 'fold' (zero-based fold
        index), 'fit_time' and 'predict_time'.
    """
    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")

    is_classification = task_type == 'classification'

    # Choose CV splitter: stratify only when the target is a class label
    if is_classification:
        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    else:
        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    fold_results = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        logger.info(f"  Fold {fold_idx + 1}/{n_folds}")

        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Auto-encode categorical columns so tree models can handle them
        X_train, X_val = _encode_categorical_columns(X_train, X_val)

        # Fit model
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_val)
        y_proba = None
        if is_classification:
            # Probabilities are optional: a model that cannot produce them
            # is still scored on its hard predictions. Catch Exception
            # (not a bare except) so KeyboardInterrupt/SystemExit still
            # propagate, and record why probabilities are missing.
            try:
                y_proba = model.predict_proba(X_val)
            except Exception as exc:
                logger.debug(
                    "  predict_proba unavailable for %s: %s",
                    model.__class__.__name__,
                    exc,
                )

        # Calculate metrics
        if is_classification:
            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
        else:
            metrics = calculate_regression_metrics(y_val, y_pred)

        # Add timing info recorded by the model wrapper during fit/predict
        metrics.update({
            'fold': fold_idx,
            'fit_time': model.fit_time,
            'predict_time': model.predict_time
        })

        fold_results.append(metrics)

    return fold_results