Spaces:

Akshay4506
/

ModelMatrix

Running

App Files Files Community

ModelMatrix / matrix /code /evaluation /cross_validation.py

Akshay4506

Fix deployment entry point and merge requirements

c4ff02d 4 days ago

raw

history blame contribute delete

3.54 kB

	"""
	Cross-Validation
	================

	K-fold cross-validation for model evaluation (10 folds by default;
	stratified for classification, plain K-fold for regression).

	Author: UW MSIM Team
	Date: November 2025
	"""

	import numpy as np
	import pandas as pd
	from sklearn.model_selection import StratifiedKFold, KFold
	from sklearn.preprocessing import LabelEncoder
	from typing import List, Dict
	import logging

	from .metrics import calculate_classification_metrics, calculate_regression_metrics

	logger = logging.getLogger(__name__)


	def _encode_categorical_columns(X_train, X_val):
	    """
	    Integer-encode object/categorical columns without using validation data.

	    For every object/category column of X_train, the label-to-code mapping
	    is learned from X_train only (labels are compared as strings and codes
	    are assigned in sorted label order) and then applied to both frames.
	    Labels in X_val that never occur in X_train are mapped to -1.

	    Parameters
	    ----------
	    X_train : pd.DataFrame
	        Training features; the columns to encode are chosen from this
	        frame's dtypes.
	    X_val : pd.DataFrame
	        Validation features; must contain the columns selected from X_train.

	    Returns
	    -------
	    X_train, X_val : tuple of pd.DataFrame
	        Encoded copies; the frames passed in are left unmodified.
	    """
	    X_train = X_train.copy()
	    X_val = X_val.copy()

	    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
	    if len(cat_cols) == 0:
	        return X_train, X_val

	    logger.info(f" Encoding {len(cat_cols)} categorical columns: {list(cat_cols[:5])}{'...' if len(cat_cols) > 5 else ''}")

	    for col in cat_cols:
	        # Compare labels as strings so mixed-type columns sort cleanly.
	        train_labels = X_train[col].astype(str)
	        val_labels = X_val[col].astype(str)

	        # Learn the mapping from the training fold only: fitting on
	        # train + validation would leak validation labels into the encoding.
	        train_codes, classes = pd.factorize(train_labels, sort=True)
	        X_train[col] = train_codes

	        # get_indexer returns -1 for labels absent from the training classes.
	        X_val[col] = pd.Index(classes).get_indexer(val_labels)

	    return X_train, X_val


	def run_cross_validation(
	    model,
	    X: pd.DataFrame,
	    y: pd.Series,
	    task_type: str = 'classification',
	    n_folds: int = 10,
	    random_state: int = 42
	) -> List[Dict]:
	    """
	    Run k-fold cross-validation and collect per-fold metrics.

	    The same ``model`` instance is re-fitted on every fold. Categorical
	    feature columns are integer-encoded per fold before fitting.

	    Parameters
	    ----------
	    model : BaseModelWrapper
	        Model to evaluate. Must provide ``fit`` and ``predict`` and expose
	        ``fit_time`` / ``predict_time`` attributes, which are read after
	        each fold. ``predict_proba`` is used when available
	        (classification only).
	    X : pd.DataFrame
	        Features
	    y : pd.Series
	        Target
	    task_type : str
	        'classification' uses shuffled StratifiedKFold; any other value is
	        treated as regression and uses shuffled KFold.
	    n_folds : int
	        Number of folds
	    random_state : int
	        Random seed for the fold shuffling

	    Returns
	    -------
	    fold_results : list of dict
	        One metrics dict per fold, extended with the keys 'fold'
	        (0-based fold index), 'fit_time' and 'predict_time'.
	    """
	    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")

	    is_classification = task_type == 'classification'

	    # Choose CV splitter
	    if is_classification:
	        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
	    else:
	        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

	    fold_results = []

	    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
	        logger.info(f" Fold {fold_idx + 1}/{n_folds}")

	        # Split data
	        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
	        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

	        # Auto-encode categorical columns so tree models can handle them
	        X_train, X_val = _encode_categorical_columns(X_train, X_val)

	        # Fit model
	        model.fit(X_train, y_train)

	        # Predict
	        y_pred = model.predict(X_val)

	        if is_classification:
	            # Probabilities are best-effort: models without a usable
	            # predict_proba are still scored on their hard predictions.
	            # Catch Exception rather than everything so that
	            # KeyboardInterrupt / SystemExit still propagate.
	            y_proba = None
	            try:
	                y_proba = model.predict_proba(X_val)
	            except Exception as exc:
	                logger.debug(
	                    " predict_proba unavailable for %s: %s",
	                    model.__class__.__name__,
	                    exc,
	                )
	            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
	        else:
	            metrics = calculate_regression_metrics(y_val, y_pred)

	        # Add fold index and the timing info recorded by the model
	        metrics.update({
	            'fold': fold_idx,
	            'fit_time': model.fit_time,
	            'predict_time': model.predict_time
	        })

	        fold_results.append(metrics)

	    return fold_results