Spaces:

zq13648
/

demand-forecasting

Runtime error

demand-forecasting / src /forecaster /utils /prep_data.py

zhang qiao

Upload folder using huggingface_hub

8cf4695 about 2 years ago

3.21 kB

	import math

	import pandas as pd
	from sktime.forecasting.base import ForecastingHorizon


	def split_x_y(
	    data: pd.DataFrame,
	    window_length: int,
	    n_predict: int,
	    freq: str,
	):
	    '''
	    Split `data` into a target series, lagged feature frames and an
	    absolute forecasting horizon.

	    `data` must contain a 'y' column. Every other column is treated as a
	    feature: for each n in 0 .. window_length - 1 the features are shifted
	    down by n rows and renamed to '<col>_-<n_predict + n>'; the NaNs the
	    shift introduces at the top are back-filled. The last `n_predict` rows
	    of the lagged frame become `X_forecast`, the remaining rows become
	    `X_train`, and `y` loses its first `n_predict` values so that it has
	    the same length and index as `X_train`.

	    When `data` only has the 'y' column, `y` is returned untrimmed and both
	    feature frames are None.

	    Args:
	        data: frame with a 'y' column and optional feature columns; the
	            last value of its index is used as the forecast cutoff.
	        window_length: number of lagged copies built per feature column.
	        n_predict: number of steps in the forecasting horizon.
	        freq: frequency string forwarded to `ForecastingHorizon`.

	    Returns:
	        Tuple `(fh, y, X_train, X_forecast)` where `fh` is the absolute
	        forecasting horizon for steps 1 .. n_predict after the cutoff.

	    Raises:
	        ValueError: if feature columns are present and `window_length` or
	            `n_predict` is smaller than 1.
	    '''
	    datetime_index = data.index
	    y = data['y']
	    X_train, X_forecast = None, None

	    # Any column besides 'y' is an additional feature column.
	    if len(data.columns) > 1:
	        # With a non-positive value the frames built below cannot line up
	        # with the trimmed index; fail early with a readable message.
	        if window_length < 1 or n_predict < 1:
	            raise ValueError(
	                'window_length and n_predict must both be >= 1 when '
	                'feature columns are present, got '
	                f'window_length={window_length}, n_predict={n_predict}')

	        X = data.drop(columns='y').reset_index(drop=True)

	        # Build every lag first and concatenate once, instead of growing
	        # a frame with repeated pd.concat calls.
	        lagged_frames = [
	            X.shift(n).rename(
	                columns={col: f'{col}_-{n_predict + n}' for col in X.columns})
	            for n in range(window_length)
	        ]
	        X_lags = pd.concat(lagged_frames, axis=1).bfill()

	        # Last n_predict rows are the features used for forecasting.
	        X_forecast = X_lags.iloc[-n_predict:]
	        X_train = X_lags.iloc[:-n_predict]

	        # Drop the first n_predict entries of y and of the index so that
	        # both have the same length as X_train.
	        y = y.iloc[n_predict:]
	        datetime_index = datetime_index[n_predict:]

	        X_train = X_train.set_index(datetime_index)

	    fh = ForecastingHorizon(
	        list(range(1, n_predict + 1)), is_relative=True, freq=freq)
	    # Cutoff is the last datetime value in the given data,
	    # meaning we'll forecast right after this point of time.
	    cutoff = datetime_index[-1]
	    fh = fh.to_absolute(cutoff=cutoff)

	    if X_forecast is not None:
	        X_forecast = X_forecast.set_index(fh.to_pandas())

	    return (fh, y, X_train, X_forecast)


	def k_folds(
	    data: pd.DataFrame,
	    period: int,
	    window_length: int,
	    n_predict: int,
	    freq: str
	):
	    '''
	    Build up to 10 folds of `data`, each prepared with `split_x_y`.

	    Fold i (for i = k .. 1) is `data` without its last `i * period` rows,
	    so folds are returned from shortest to longest and the last `period`
	    rows of `data` are not part of any fold. `k` is chosen so that the
	    shortest fold still has at least `2 * period + n_predict` rows, which
	    means `data` needs at least `3 * period + n_predict` rows.

	    Args:
	        data: full data set, forwarded in slices to `split_x_y`.
	        period: seasonality period in rows; must be >= 1.
	        window_length: forwarded to `split_x_y`.
	        n_predict: forwarded to `split_x_y`.
	        freq: forwarded to `split_x_y`.

	    Returns:
	        List with one `split_x_y` result per fold.

	    Raises:
	        ValueError: if `period` is smaller than 1 or `data` is too short
	            to produce at least one fold.
	    '''
	    print('[k_folds] ----- START -----')
	    if period < 1:
	        raise ValueError(f'period must be a positive integer, got {period}')

	    k = math.floor((len(data) - n_predict - (2 * period)) / period)
	    print('k', k)

	    # Make sure k is not larger than 10
	    k = min(k, 10)

	    # k is negative (not just zero) when data is shorter than
	    # 2 * period + n_predict, so reject everything below one fold.
	    if k < 1:
	        min_rows = 3 * period + n_predict
	        raise ValueError(
	            'Data should at least have length of 3 seasons + n_predict rows '
	            '(one held-out season plus a smallest fold of 2 seasons + '
	            f'n_predict), currently length {len(data)}, '
	            f'expected length {min_rows}')

	    folds = [
	        split_x_y(data[: (-i * period)], window_length, n_predict, freq)
	        for i in range(k, 0, -1)
	    ]

	    print('[k_folds] ----- END -----')
	    return folds