# Deployed via GitHub Actions to Hugging Face Space: TurnoverForecasting (commit e4f4b02)
# %%
# Standard library
import itertools
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from arch import arch_model
from numpy.polynomial.polynomial import Polynomial
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.filters.hp_filter import hpfilter
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Load the processed dataset for the requested NAICS industry code.
req_naics = 336111
df = pd.read_csv(f"../data/processed_data_{req_naics}.csv")
print(df.head())

# Parse the year column into datetimes so time-series tooling works downstream.
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Series under study: value of shipments (the turnover proxy).
series = df['vship']

# Autocorrelation plot — look for a seasonal spike (e.g. near lag 12).
plot_acf(series, lags=20)
plt.show()

# Crude periodogram: FFT magnitudes reveal dominant frequencies, if any.
freq_spectrum = np.fft.fft(series.dropna())
plt.plot(np.abs(freq_spectrum))
plt.title("Frequency Spectrum")
plt.show()
# Apply the Hodrick-Prescott filter to split the series into cycle and trend.
# FIX: the original used lamb=1600 and claimed it is "common for annual data".
# 1600 is the standard smoothing parameter for QUARTERLY data; for annual
# data Ravn & Uhlig (2002) recommend lamb=6.25 (1600 / 4**4). The 'year'
# column is parsed with format='%Y', so this series is annual.
cycle, trend = hpfilter(series, lamb=6.25)

# Plot the extracted trend against the original series.
plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(trend, label="Extracted Trend (HP Filter)", color="red", linewidth=2)
plt.legend()
plt.title("Trend Extraction Using HP Filter")
plt.show()
# Fit a polynomial trend over a numeric (ordinal) time index.
# FIX: the original comments said "2nd-degree" and labeled the curve
# "Quadratic Trend" while actually fitting deg=5 — the comments and the
# plot label now match the fitted degree.
X = np.arange(len(series))  # ordinal time index 0..n-1
y = series.values
poly_degree = 5
p = Polynomial.fit(X, y, deg=poly_degree)

# Plot the fitted polynomial trend over the original series.
plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(series.index, p(X), label=f"Degree-{poly_degree} Polynomial Trend",
         color="red", linewidth=2)
plt.legend()
plt.title("Polynomial Trend Fitting")
plt.show()
# First-order differencing to remove the trend component.
series_diff = series.diff().dropna()

# Re-examine autocorrelation on the differenced series.
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(series_diff, lags=20)
plt.show()

# Augmented Dickey-Fuller test: p < 0.05 => reject unit root (stationary).
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(series_diff)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")
is_stationary = adf_result[1] < 0.05
if is_stationary:
    print("Series is stationary after differencing.")
else:
    print("Series is still non-stationary, further differencing")
# Second-order differencing, then recheck ACF and stationarity.
series_diff2 = series.diff().diff().dropna()

# ACF after second differencing.
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(series_diff2, lags=20)
plt.show()

# ADF test on the twice-differenced series.
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(series_diff2)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")
if adf_result[1] < 0.05:
    print("Series is now stationary after second differencing.")
else:
    print("Series is still non-stationary, further transformations may be needed.")

# Side-by-side ACF (guides q) and PACF (guides p) on the same series.
# Reuses series_diff2 instead of recomputing series.diff().diff().dropna().
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sm.graphics.tsa.plot_acf(series_diff2, lags=20, ax=axes[0])
axes[0].set_title("Autocorrelation Function (ACF)")
sm.graphics.tsa.plot_pacf(series_diff2, lags=20, ax=axes[1])
axes[1].set_title("Partial Autocorrelation Function (PACF)")
plt.show()
# Chronological 80/20 split: the final 20% is held out for evaluation.
split_at = int(len(series) * 0.8)
train = series.iloc[:split_at]
holdout = series.iloc[split_at:]

# Visualize where the split falls.
plt.figure(figsize=(10, 4))
plt.plot(train, label="Training Data")
plt.plot(holdout, label="Holdout Data", color="red")
plt.title("Train-Holdout Split")
plt.legend()
plt.show()
# Walk-forward cross-validation: ARIMA for the conditional mean, with a
# GARCH volatility model fitted on the ARIMA residuals of each fold.
tscv = TimeSeriesSplit(n_splits=5)  # 5 expanding-window folds
p, d, q = 1, 2, 2                   # ARIMA order chosen from ACF/PACF analysis
garch_p, garch_q = 1, 1             # GARCH(p, q) hyperparameters

arima_errors = []                # per-fold MAE of the ARIMA point forecast
garch_volatility_forecasts = []  # per-fold forecast volatility (std dev)

for train_idx, test_idx in tscv.split(train):
    train_fold, test_fold = train.iloc[train_idx], train.iloc[test_idx]

    # Fit ARIMA on the expanding training window.
    arima_fit = ARIMA(train_fold, order=(p, d, q)).fit()

    # Point forecast over the test fold.
    arima_forecast = arima_fit.forecast(steps=len(test_fold))

    # FIX: the original called mean_absolute_error without importing it
    # (only mean_absolute_percentage_error was imported), so this loop
    # raised NameError on the first fold. The import is added at the top
    # of the file.
    error = mean_absolute_error(test_fold, arima_forecast)
    arima_errors.append(error)

    # Model residual volatility with GARCH on the ARIMA residuals.
    residuals = arima_fit.resid
    garch_model = arch_model(residuals, vol='Garch', p=garch_p, q=garch_q)
    garch_fit = garch_model.fit(disp="off")

    # Out-of-sample volatility forecast for the test-fold horizon.
    # NOTE(review): start=len(residuals) points one past the last in-sample
    # index; arch's default (forecast from the last observation) may be what
    # was intended — verify this does not raise on this arch version.
    garch_forecast = garch_fit.forecast(start=len(residuals), horizon=len(test_fold))
    volatility_forecast = np.sqrt(garch_forecast.variance.iloc[-1])
    garch_volatility_forecasts.append(volatility_forecast)

# Summarize point-forecast accuracy across folds.
print(f"Average ARIMA MAE Across Folds: {np.mean(arima_errors)}")