Spaces:
Running
Running
| """ | |
| Cross-Validation | |
| ================ | |
| K-fold cross-validation for model evaluation (10 folds by default; stratified for classification tasks, plain shuffled K-fold otherwise). | |
| Author: UW MSIM Team | |
| Date: November 2025 | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import StratifiedKFold, KFold | |
| from sklearn.preprocessing import LabelEncoder | |
| from typing import List, Dict | |
| import logging | |
| from .metrics import calculate_classification_metrics, calculate_regression_metrics | |
| logger = logging.getLogger(__name__) | |
| def _encode_categorical_columns(X_train, X_val): | |
| """ | |
| Label-encode object/categorical columns. Fitted on X_train, | |
| applied to both X_train and X_val. Unknown categories in X_val | |
| are mapped to -1. | |
| """ | |
| X_train = X_train.copy() | |
| X_val = X_val.copy() | |
| cat_cols = X_train.select_dtypes(include=['object', 'category']).columns | |
| if len(cat_cols) == 0: | |
| return X_train, X_val | |
| logger.info(f" Encoding {len(cat_cols)} categorical columns: {list(cat_cols[:5])}{'...' if len(cat_cols) > 5 else ''}") | |
| for col in cat_cols: | |
| le = LabelEncoder() | |
| # Fit on combined unique values from train (+ handle unseen in val) | |
| combined = pd.concat([X_train[col], X_val[col]], axis=0).astype(str) | |
| le.fit(combined) | |
| X_train[col] = le.transform(X_train[col].astype(str)) | |
| X_val[col] = le.transform(X_val[col].astype(str)) | |
| return X_train, X_val | |
def run_cross_validation(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    task_type: str = 'classification',
    n_folds: int = 10,
    random_state: int = 42
) -> List[Dict]:
    """
    Run k-fold cross-validation and collect per-fold metrics.

    For every fold the model is fitted on the training split and evaluated
    on the validation split. Object/category feature columns are
    integer-encoded per fold before fitting.

    Parameters
    ----------
    model : BaseModelWrapper
        Model to evaluate. Must provide ``fit(X, y)`` and ``predict(X)``, and
        expose ``fit_time`` and ``predict_time`` attributes after those calls.
        ``predict_proba(X)`` is used for classification when it works.
    X : pd.DataFrame
        Features
    y : pd.Series
        Target
    task_type : str
        'classification' uses shuffled StratifiedKFold and classification
        metrics; any other value uses shuffled KFold and regression metrics.
    n_folds : int
        Number of folds
    random_state : int
        Random seed for the fold shuffling

    Returns
    -------
    fold_results : list of dict
        One metrics dict per fold, extended with the keys 'fold'
        (zero-based fold index), 'fit_time' and 'predict_time'.
    """
    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")

    is_classification = task_type == 'classification'

    # Choose CV splitter: stratify only when there are class labels to balance.
    if is_classification:
        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    else:
        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    fold_results = []
    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        logger.info(f" Fold {fold_idx + 1}/{n_folds}")

        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Auto-encode categorical columns so tree models can handle them
        X_train, X_val = _encode_categorical_columns(X_train, X_val)

        # Fit model
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_val)

        y_proba = None
        if is_classification:
            # Probabilities are best-effort: a model without a working
            # predict_proba is still scored on its hard predictions. Catch
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate, and log the reason instead of hiding it.
            try:
                y_proba = model.predict_proba(X_val)
            except Exception as exc:
                logger.warning(
                    "predict_proba failed for %s on fold %d; "
                    "continuing without probabilities: %s",
                    model.__class__.__name__, fold_idx + 1, exc
                )

        # Calculate metrics
        if is_classification:
            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
        else:
            metrics = calculate_regression_metrics(y_val, y_pred)

        # Add fold index and timing info
        metrics.update({
            'fold': fold_idx,
            'fit_time': model.fit_time,
            'predict_time': model.predict_time
        })
        fold_results.append(metrics)

    return fold_results