# Spaces: Akshay4506 / ModelMatrix (Running)
# File size: 3,540 Bytes
# e057d08

"""
Cross-Validation
================

K-fold cross-validation for model evaluation (10 folds by default;
stratified for classification tasks, plain shuffled K-fold otherwise).

Author: UW MSIM Team
Date: November 2025
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from typing import List, Dict
import logging

from .metrics import calculate_classification_metrics, calculate_regression_metrics

# Module-level logger, named after this module (`__name__`); used by the
# cross-validation helpers below for per-fold progress messages.
logger = logging.getLogger(__name__)


def _encode_categorical_columns(X_train, X_val):
    """
    Integer-encode object/categorical columns without leaking validation data.

    The category vocabulary of each column is learned from ``X_train`` only.
    Every distinct training value (compared by its string form) receives a
    code ``0..k-1`` in sorted order; values in ``X_val`` that never occurred
    in ``X_train`` are encoded as ``-1``.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training-fold features. Columns with dtype ``object`` or
        ``category`` are the ones that get encoded.
    X_val : pd.DataFrame
        Validation-fold features with the same columns as ``X_train``.

    Returns
    -------
    X_train, X_val : tuple of pd.DataFrame
        Copies of the inputs with the categorical columns replaced by
        ``int64`` codes. The frames passed in are not modified.
    """
    # Work on copies so the caller's frames (often slices of a larger
    # frame) are never mutated.
    X_train = X_train.copy()
    X_val = X_val.copy()

    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) == 0:
        return X_train, X_val

    logger.info(
        "  Encoding %d categorical columns: %s%s",
        len(cat_cols),
        list(cat_cols[:5]),
        '...' if len(cat_cols) > 5 else '',
    )

    for col in cat_cols:
        # Vocabulary comes from the training fold only; pd.Categorical
        # infers the categories as the sorted unique string values.
        train_cat = pd.Categorical(X_train[col].astype(str))
        # Re-using the training categories makes pandas assign code -1 to
        # any validation value that was not seen during training.
        val_cat = pd.Categorical(
            X_val[col].astype(str), categories=train_cat.categories
        )
        X_train[col] = train_cat.codes.astype('int64')
        X_val[col] = val_cat.codes.astype('int64')

    return X_train, X_val


def run_cross_validation(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    task_type: str = 'classification',
    n_folds: int = 10,
    random_state: int = 42
) -> List[Dict]:
    """
    Run shuffled k-fold cross-validation and collect per-fold metrics.

    For ``task_type == 'classification'`` the folds are stratified on ``y``
    (``StratifiedKFold``); any other value uses plain ``KFold``. In every
    fold the object/categorical feature columns are integer-encoded, the
    same ``model`` object is re-fitted on the training part and evaluated
    on the held-out part.

    Parameters
    ----------
    model : BaseModelWrapper
        Model to evaluate. Must provide ``fit(X, y)`` and ``predict(X)``
        and expose ``fit_time`` / ``predict_time`` attributes after those
        calls. ``predict_proba(X)`` is used for classification when it
        works; otherwise probabilities are treated as unavailable.
    X : pd.DataFrame
        Features
    y : pd.Series
        Target
    task_type : str
        'classification' or 'regression'
    n_folds : int
        Number of folds
    random_state : int
        Random seed for the shuffled splitter

    Returns
    -------
    fold_results : list of dict
        One dict per fold, in fold order: the metrics returned by the
        classification/regression metrics helper plus the keys ``'fold'``
        (0-based fold index), ``'fit_time'`` and ``'predict_time'``.
    """
    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")

    is_classification = task_type == 'classification'

    # Choose CV splitter: stratify only when there are class labels to balance
    if is_classification:
        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    else:
        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    fold_results = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        logger.info(f"  Fold {fold_idx + 1}/{n_folds}")

        # Split data (the splitter yields positional indices, hence iloc)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Auto-encode categorical columns so tree models can handle them
        X_train, X_val = _encode_categorical_columns(X_train, X_val)

        # Fit model
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_val)
        y_proba = None
        if is_classification:
            # Probabilities are best-effort: a model without a usable
            # predict_proba is still scored on its hard predictions.
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate, and record why it was skipped.
            try:
                y_proba = model.predict_proba(X_val)
            except Exception as exc:
                logger.debug(
                    "  predict_proba unavailable for %s: %s",
                    model.__class__.__name__,
                    exc,
                )

        # Calculate metrics
        if is_classification:
            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
        else:
            metrics = calculate_regression_metrics(y_val, y_pred)

        # Add timing info (read from the model after fit/predict have run)
        metrics.update({
            'fold': fold_idx,
            'fit_time': model.fit_time,
            'predict_time': model.predict_time
        })

        fold_results.append(metrics)

    return fold_results