pq / test_predictor_statsforecast.py
Daniel Varga
moving v2 to .
0202035
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsforecast import StatsForecast
from statsforecast.models import (
ARIMA,
AutoARIMA,
AutoETS,
AutoCES,
DynamicOptimizedTheta,
MSTL,
SeasonalNaive,
)
from datasetsforecast.losses import rmse, mae
os.environ["NIXTLA_NUMBA_RELEASE_GIL"] = "1"
os.environ["NIXTLA_NUMBA_CACHE"] = "1"
data = pd.read_csv("terheles_fixed.tsv", sep="\t")
data['ds'] = pd.to_datetime(data['Korrigált időpont'])
data['y'] = data['Hatásos teljesítmény']
data = data[['ds', 'y']]
data['unique_id'] = 1
data = data[data['ds'] < '2019-03-01']
Y_df = data
train_df = Y_df[Y_df['ds'] < '2019-02-01']
horizon = 4 * 24 * 7 # 7 days
def ensemble_forecasts(
fcsts_df,
quantiles,
name_models,
model_name,
) -> pd.DataFrame:
fcsts_df[model_name] = fcsts_df[name_models].mean(axis=1).values # type: ignore
# compute quantiles based on the mean of the forecasts
sigma_models = []
for model in name_models:
fcsts_df[f"sigma_{model}"] = fcsts_df[f"{model}-hi-68.27"] - fcsts_df[model]
sigma_models.append(f"sigma_{model}")
fcsts_df[f"std_{model_name}"] = (
fcsts_df[sigma_models].pow(2).sum(axis=1).div(len(sigma_models) ** 2).pow(0.5)
)
z = norm.ppf(quantiles)
q_cols = []
for q, zq in zip(quantiles, z):
q_col = f"{model_name}-q-{q}"
fcsts_df[q_col] = fcsts_df[model_name] + zq * fcsts_df[f"std_{model_name}"]
q_cols.append(q_col)
fcsts_df = fcsts_df[["unique_id", "ds"] + [model_name] + q_cols]
return fcsts_df
def run_statistical_ensemble(
train_df: pd.DataFrame,
horizon: int,
freq: str,
seasonality: int,
quantiles,
):
os.environ["NIXTLA_ID_AS_COL"] = "true"
models = [
AutoARIMA(season_length=seasonality),
AutoETS(season_length=seasonality),
AutoCES(season_length=seasonality),
DynamicOptimizedTheta(season_length=seasonality),
]
init_time = time()
series_per_core = 15
n_series = train_df["unique_id"].nunique()
n_jobs = min(n_series // series_per_core, os.cpu_count())
sf = StatsForecast(
models=models,
freq=freq,
n_jobs=n_jobs,
)
fcsts_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
name_models = [repr(model) for model in models]
model_name = "StatisticalEnsemble"
fcsts_df = ensemble_forecasts(
fcsts_df,
quantiles,
name_models,
model_name,
)
total_time = time() - init_time
return fcsts_df, total_time, model_name
# unlike MSTL, the others only allow a single season_length:
seasonality = 4 * 24 * 1 # 1 day
models = [
MSTL(
season_length=[4 * 24, 4 * 24 * 7], # seasonalities of the time series
trend_forecaster=AutoARIMA() # model used to forecast trend
),
SeasonalNaive(season_length=seasonality)
]
EXTENDED_TEST = True
if EXTENDED_TEST:
models += [
# AutoARIMA(season_length=4 * 24) is just too slow, never even finishes,
# spends all its time in scipy bfgs.
# which is weird, because it's works okay as trend-detector of MSTL.
AutoARIMA(),
AutoETS(season_length=seasonality),
AutoCES(season_length=seasonality),
DynamicOptimizedTheta(season_length=seasonality),
]
freq = '15min'
sf = StatsForecast(
models=models,
freq=freq,
n_jobs=-1,
)
model_names = [repr(model) for model in models]
n_windows = len(train_df) // horizon - 1
print("crossvalidation with", n_windows, "windows")
print("models:", ", ".join(model_names))
crossvalidation_df = sf.cross_validation(df=train_df, h=horizon, step_size=horizon, n_windows=n_windows)
for model_name in model_names:
rmse_crossval = rmse(crossvalidation_df['y'], crossvalidation_df[model_name])
mae_crossval = mae(crossvalidation_df['y'], crossvalidation_df[model_name])
print(model_name, "RMSE", rmse_crossval, "MAE", mae_crossval)
exit()
sf.fit(train_df)
sf.fitted_[0, 0].model_.tail(4 * 24 * 7 * 2).plot(subplots=True, grid=True)
plt.tight_layout()
plt.show()
print("starting forecast, dataset size", len(train_df))
# Y_hat_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
Y_hat_df = sf.predict(h=horizon, level=[68.27])
print(Y_hat_df)
Y_hat_df = Y_hat_df.reset_index()
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)
plot_Y_df = Y_df # [Y_df['ds'] > '2019-07-01']
plot_Y_df = plot_Y_df.set_index('ds')[['y']]
plot_Y_df.plot(ax=ax, linewidth=1)
Y_hat_df.set_index('ds').plot(ax=ax, linewidth=1)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
plt.savefig("neuralforecast.pdf")
exit()
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
fcst_df, total_time, model_name = run_statistical_ensemble(
train_df,
horizon=horizon,
freq='15m',
seasonality=4 * 24 * 7,
quantiles=quantiles
)
nf.fit(df=train_df)
Y_hat_df = nf.predict()
print(Y_df)
Y_hat_df = Y_hat_df.reset_index()
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
# plot_df = pd.concat([Y_df, Y_hat_df]).set_index('ds') # Concatenate the train and forecast dataframes
# plot_df[['y', 'LSTM', 'NHITS']].plot(ax=ax, linewidth=2)
plot_Y_df = Y_df[Y_df['ds'] > '2019-07-01']
plot_Y_df = plot_Y_df.set_index('ds')[['y']]
plot_Y_df.plot(ax=ax, linewidth=1)
Y_hat_df.set_index('ds')[['PatchTST', 'NHITS']].plot(ax=ax, linewidth=1)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
plt.savefig("neuralforecast.pdf")