| """ | |
| Time Series Cross-Validation | |
| Implements proper cross-validation for time series data with expanding window | |
| and no future data leakage. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score | |
class TimeSeriesSplit:
    """
    Time series cross-validator with an expanding training window.

    For each fold:
    - The training set expands from the start of the series up to time t
    - The test set is the contiguous block from t to t + fold size
    - No future data ever appears in the training set
    """

    def __init__(self, n_splits=5):
        """
        Initialize TimeSeriesSplit.

        Parameters
        ----------
        n_splits : int
            Number of folds
        """
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """
        Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like
            Data to split
        y : array-like, optional
            Target variable (not used, for sklearn compatibility)
        groups : array-like, optional
            Group labels (not used, for sklearn compatibility)

        Yields
        ------
        train : ndarray
            Training indices for the fold
        test : ndarray
            Test indices for the fold
        """
        n_samples = len(X)
        fold_size = n_samples // (self.n_splits + 1)

        for fold in range(1, self.n_splits + 1):
            # Training indices: from start to fold_size * fold
            train_end = fold_size * fold
            train_indices = np.arange(0, train_end)

            # Test indices: contiguous block after training
            test_start = train_end
            test_end = min(test_start + fold_size, n_samples)
            test_indices = np.arange(test_start, test_end)

            # Skip fold if test set is empty
            if len(test_indices) > 0:
                yield train_indices, test_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator."""
        return self.n_splits
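
# Illustrative sketch of how TimeSeriesSplit partitions a series (assuming
# 100 samples and the default n_splits=5, so fold_size = 100 // 6 = 16):
#
#   fold 1: train indices [0, 16)   test indices [16, 32)
#   fold 2: train indices [0, 32)   test indices [32, 48)
#   fold 3: train indices [0, 48)   test indices [48, 64)
#   fold 4: train indices [0, 64)   test indices [64, 80)
#   fold 5: train indices [0, 80)   test indices [80, 96)
#
# The training window only ever grows forward in time, and the last few
# samples (96-99 in this sketch) are never used as a test block.
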
class TimeSeriesCV:
    """
    Time Series Cross-Validator with metrics calculation.

    Evaluates a model across multiple time series folds and computes average
    metrics.
    """

    def __init__(self, n_splits=5):
        """
        Initialize TimeSeriesCV.

        Parameters
        ----------
        n_splits : int
            Number of folds for cross-validation
        """
        self.n_splits = n_splits
        self.splitter = TimeSeriesSplit(n_splits=n_splits)

    def split(self, X, y=None, groups=None):
        """
        Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like
            Data to split
        y : array-like, optional
            Target variable (not used, for sklearn compatibility)
        groups : array-like, optional
            Group labels (not used, for sklearn compatibility)

        Yields
        ------
        train : ndarray
            Training indices for the fold
        test : ndarray
            Test indices for the fold
        """
        for train_indices, test_indices in self.splitter.split(X, y, groups):
            yield train_indices, test_indices
    def evaluate(self, model_func, X, y, **fit_params):
        """
        Evaluate a model using cross-validation.

        Parameters
        ----------
        model_func : callable
            Function that takes (X_train, y_train) and returns a fitted model
            with a .predict() method
        X : array-like
            Feature matrix
        y : array-like
            Target vector
        **fit_params : dict
            Additional parameters for model_func

        Returns
        -------
        results : dict
            Dictionary with average and std metrics across folds
        """
        metrics_list = {
            'rmse': [],
            'mae': [],
            'mape': [],
            'r2': []
        }

        for fold_idx, (train_indices, test_indices) in enumerate(self.split(X, y), 1):
            # Get train/test data for this fold (supports pandas and numpy inputs)
            X_fold_train = X.iloc[train_indices] if hasattr(X, 'iloc') else X[train_indices]
            y_fold_train = y.iloc[train_indices] if hasattr(y, 'iloc') else y[train_indices]
            X_fold_test = X.iloc[test_indices] if hasattr(X, 'iloc') else X[test_indices]
            y_fold_test = y.iloc[test_indices] if hasattr(y, 'iloc') else y[test_indices]

            # Train model
            model = model_func(X_fold_train, y_fold_train, **fit_params)

            # Make predictions
            y_pred = model.predict(X_fold_test)

            # Calculate metrics (note: MAPE assumes strictly non-zero targets)
            rmse = np.sqrt(mean_squared_error(y_fold_test, y_pred))
            mae = mean_absolute_error(y_fold_test, y_pred)
            mape = np.mean(np.abs((y_fold_test - y_pred) / y_fold_test)) * 100
            r2 = r2_score(y_fold_test, y_pred)

            metrics_list['rmse'].append(rmse)
            metrics_list['mae'].append(mae)
            metrics_list['mape'].append(mape)
            metrics_list['r2'].append(r2)

        # Compute statistics
        results = {
            'rmse_mean': np.mean(metrics_list['rmse']),
            'rmse_std': np.std(metrics_list['rmse']),
            'mae_mean': np.mean(metrics_list['mae']),
            'mae_std': np.std(metrics_list['mae']),
            'mape_mean': np.mean(metrics_list['mape']),
            'mape_std': np.std(metrics_list['mape']),
            'r2_mean': np.mean(metrics_list['r2']),
            'r2_std': np.std(metrics_list['r2']),
            'fold_rmse': metrics_list['rmse'],
            'fold_mae': metrics_list['mae'],
            'fold_mape': metrics_list['mape'],
            'fold_r2': metrics_list['r2']
        }

        return results
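
    # Usage sketch for evaluate() (illustrative only; `fit_linear`, `X`, and `y`
    # are hypothetical names, and a scikit-learn style regressor with a
    # .fit()/.predict() interface is assumed):
    #
    #     def fit_linear(X_train, y_train):
    #         from sklearn.linear_model import LinearRegression
    #         return LinearRegression().fit(X_train, y_train)
    #
    #     cv = TimeSeriesCV(n_splits=5)
    #     results = cv.evaluate(fit_linear, X, y)
    #     print(results['rmse_mean'], results['rmse_std'])
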
    def plot_results(self, results, figsize=(14, 8)):
        """
        Plot cross-validation results.

        Parameters
        ----------
        results : dict
            Results from evaluate()
        figsize : tuple
            Figure size (width, height)
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            print("matplotlib not available, skipping plots")
            return

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # Plot 1: RMSE across folds
        ax = axes[0, 0]
        folds = np.arange(1, len(results['fold_rmse']) + 1)
        ax.bar(folds, results['fold_rmse'], color='steelblue', alpha=0.7, edgecolor='black')
        ax.axhline(results['rmse_mean'], color='red', linestyle='--', linewidth=2,
                   label=f"Mean: {results['rmse_mean']:.2f}")
        ax.set_xlabel('Fold')
        ax.set_ylabel('RMSE (MW)')
        ax.set_title('RMSE Across Folds')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

        # Plot 2: All metrics
        ax = axes[0, 1]
        metrics = ['RMSE', 'MAE', 'MAPE', 'R²']
        means = [results['rmse_mean'], results['mae_mean'], results['mape_mean'],
                 results['r2_mean']]
        stds = [results['rmse_std'], results['mae_std'], results['mape_std'],
                results['r2_std']]
        ax.bar(metrics, means, yerr=stds, capsize=5, color='steelblue', alpha=0.7,
               edgecolor='black')
        ax.set_ylabel('Value')
        ax.set_title('Average Metrics ± Std Dev')
        ax.grid(axis='y', alpha=0.3)

        # Plot 3: R² across folds
        ax = axes[1, 0]
        ax.plot(folds, results['fold_r2'], marker='o', linestyle='-', linewidth=2,
                markersize=8, color='green', alpha=0.7)
        ax.axhline(results['r2_mean'], color='red', linestyle='--', linewidth=2,
                   label=f"Mean: {results['r2_mean']:.4f}")
        ax.set_xlabel('Fold')
        ax.set_ylabel('R² Score')
        ax.set_title('R² Score Across Folds')
        ax.set_ylim([0, 1])
        ax.legend()
        ax.grid(alpha=0.3)

        # Plot 4: Metric variation
        ax = axes[1, 1]
        variation = [
            (results['rmse_std'] / results['rmse_mean']) * 100,
            (results['mae_std'] / results['mae_mean']) * 100,
            (results['mape_std'] / results['mape_mean']) * 100,
            (results['r2_std'] / results['r2_mean']) * 100 if results['r2_mean'] > 0 else 0
        ]
        colors = ['green' if v < 15 else 'orange' for v in variation]
        ax.bar(metrics, variation, color=colors, alpha=0.7, edgecolor='black')
        ax.axhline(15, color='red', linestyle='--', linewidth=2, label='15% threshold')
        ax.set_ylabel('Variation (%)')
        ax.set_title('Metric Stability Across Folds')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        return fig
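

# Minimal self-contained demo (illustrative sketch, not part of the module's
# original API): fits a plain LinearRegression on synthetic data with a trend
# and a daily cycle, then prints the cross-validated metrics. All names below
# (fit_model, X_demo, y_demo) are hypothetical.
if __name__ == "__main__":
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(42)
    n = 500
    t = np.arange(n)
    X_demo = pd.DataFrame({
        "t": t,
        "sin_day": np.sin(2 * np.pi * t / 24),
    })
    # Trend + daily cycle + noise, kept strictly positive so MAPE is defined
    y_demo = pd.Series(100 + 0.05 * t + 10 * np.sin(2 * np.pi * t / 24)
                       + rng.normal(0, 2, n))

    def fit_model(X_train, y_train):
        return LinearRegression().fit(X_train, y_train)

    cv = TimeSeriesCV(n_splits=5)
    res = cv.evaluate(fit_model, X_demo, y_demo)
    print(f"RMSE: {res['rmse_mean']:.2f} ± {res['rmse_std']:.2f}")
    print(f"MAPE: {res['mape_mean']:.2f}% ± {res['mape_std']:.2f}%")
    print(f"R²:   {res['r2_mean']:.4f} ± {res['r2_std']:.4f}")
    # fig = cv.plot_results(res)  # uncomment to visualize the fold metrics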