# ModelMatrix / matrix / code / evaluation / cross_validation.py
# Akshay4506's picture
# Fix deployment entry point and merge requirements
# c4ff02d
"""
Cross-Validation
================
K-fold cross-validation for model evaluation (stratified for classification; 10 folds by default).
Author: UW MSIM Team
Date: November 2025
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from typing import List, Dict
import logging
from .metrics import calculate_classification_metrics, calculate_regression_metrics
logger = logging.getLogger(__name__)
def _encode_categorical_columns(X_train, X_val):
    """
    Integer-encode object/categorical columns without using validation data.

    For every column of ``X_train`` whose dtype is ``object`` or
    ``category``, each value is converted with ``str()`` and the sorted
    unique training strings are numbered ``0..k-1``. The same mapping is then
    applied to ``X_val``; validation values that never occur in the training
    column are mapped to ``-1``.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. The encoding is learned from this frame only.
    X_val : pd.DataFrame
        Validation features, encoded with the mapping learned on ``X_train``.

    Returns
    -------
    X_train, X_val : tuple of pd.DataFrame
        Encoded copies of the inputs; the frames passed in are not modified.
    """
    X_train = X_train.copy()
    X_val = X_val.copy()

    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) == 0:
        return X_train, X_val

    logger.info(
        " Encoding %s categorical columns: %s%s",
        len(cat_cols),
        list(cat_cols[:5]),
        '...' if len(cat_cols) > 5 else '',
    )

    for col in cat_cols:
        # str() per element keeps missing values as an ordinary label
        # (e.g. 'nan') instead of leaving gaps in the encoded column.
        train_labels = [str(value) for value in X_train[col]]
        val_labels = [str(value) for value in X_val[col]]

        # Learn the mapping from the training fold only, so that nothing
        # about the validation fold influences the encoding.
        code_of = {
            label: code
            for code, label in enumerate(sorted(set(train_labels)))
        }

        X_train[col] = np.array(
            [code_of[label] for label in train_labels], dtype=np.int64
        )
        # Categories unseen during training get the sentinel code -1.
        X_val[col] = np.array(
            [code_of.get(label, -1) for label in val_labels], dtype=np.int64
        )

    return X_train, X_val
def run_cross_validation(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    task_type: str = 'classification',
    n_folds: int = 10,
    random_state: int = 42
) -> List[Dict]:
    """
    Run k-fold cross-validation and collect per-fold metrics.

    The same ``model`` instance is refit on every fold. Object/categorical
    feature columns are integer-encoded per fold before fitting.

    Parameters
    ----------
    model : BaseModelWrapper
        Model to evaluate. Must provide ``fit(X, y)`` and ``predict(X)``, and
        expose ``fit_time`` and ``predict_time`` attributes after use.
        ``predict_proba(X)`` is used for classification when it works.
    X : pd.DataFrame
        Features.
    y : pd.Series
        Target.
    task_type : str
        ``'classification'`` selects a shuffled ``StratifiedKFold`` and
        classification metrics; any other value selects a shuffled ``KFold``
        and regression metrics.
    n_folds : int
        Number of folds.
    random_state : int
        Random seed passed to the splitter.

    Returns
    -------
    fold_results : list of dict
        One metrics dict per fold, extended with ``'fold'`` (0-based index),
        ``'fit_time'`` and ``'predict_time'``.
    """
    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")

    is_classification = task_type == 'classification'

    # Stratify only for classification, where class balance per fold matters.
    splitter_cls = StratifiedKFold if is_classification else KFold
    cv = splitter_cls(n_splits=n_folds, shuffle=True, random_state=random_state)

    fold_results = []
    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        logger.info(" Fold %s/%s", fold_idx + 1, n_folds)

        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Auto-encode categorical columns so tree models can handle them
        X_train, X_val = _encode_categorical_columns(X_train, X_val)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        if is_classification:
            # Probabilities are best-effort: metrics are still computed from
            # the hard predictions when they are unavailable. Only Exception
            # subclasses are caught, so Ctrl-C / SystemExit still propagate.
            y_proba = None
            try:
                y_proba = model.predict_proba(X_val)
            except (AttributeError, NotImplementedError) as exc:
                logger.debug(
                    " predict_proba not available for %s: %s",
                    model.__class__.__name__, exc,
                )
            except Exception as exc:
                logger.warning(
                    " predict_proba failed for %s on fold %s: %s",
                    model.__class__.__name__, fold_idx + 1, exc,
                )
            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
        else:
            metrics = calculate_regression_metrics(y_val, y_pred)

        # Add fold index and timing info
        metrics.update({
            'fold': fold_idx,
            'fit_time': model.fit_time,
            'predict_time': model.predict_time
        })
        fold_results.append(metrics)

    return fold_results