# %%
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from arch import arch_model
from numpy.polynomial.polynomial import Polynomial
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.filters.hp_filter import hpfilter
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Target industry (NAICS code) and its preprocessed dataset.
req_naics = 336111
df = pd.read_csv(f"../data/processed_data_{req_naics}.csv")
print(df.head())

# Parse the year column into datetimes so downstream tools treat it as a time axis.
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Value of shipments ('vship') is the series analysed/forecast throughout this script.
series = df['vship']

# Autocorrelation plot: look for repeating structure in the raw series.
plot_acf(series, lags=20)
plt.show()

# Frequency-domain view: FFT magnitudes highlight dominant periodicities.
freq_spectrum = np.fft.fft(series.dropna())
plt.plot(np.abs(freq_spectrum))
plt.title("Frequency Spectrum")
plt.show()
# --- Trend extraction ---------------------------------------------------
# Hodrick-Prescott filter decomposes the series into cycle + trend.
# NOTE(review): lamb=1600 is the standard choice for *quarterly* data; for
# annual series the literature recommends lamb ~ 6.25 (Ravn-Uhlig) or 100.
# Confirm the sampling frequency before trusting this smoothing level.
cycle, trend = hpfilter(series, lamb=1600)

# Plot the HP-filter trend against the original series.
plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(trend, label="Extracted Trend (HP Filter)", color="red", linewidth=2)
plt.legend()
plt.title("Trend Extraction Using HP Filter")
plt.show()

# Deterministic polynomial trend as an alternative to the HP filter.
X = np.arange(len(series))  # simple 0..n-1 time index
y = series.values

# BUG FIX: the original comment and plot label said "2nd-degree"/"Quadratic"
# while the code actually fitted deg=5; the label now matches the fit.
poly_degree = 5
trend_poly = Polynomial.fit(X, y, deg=poly_degree)

plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(series.index, trend_poly(X),
         label=f"Degree-{poly_degree} Polynomial Trend", color="red", linewidth=2)
plt.legend()
plt.title("Polynomial Trend Fitting")
plt.show()
# First difference to remove the trend, then re-inspect autocorrelation.
series_diff = series.diff().dropna()
plot_acf(series_diff, lags=20)
plt.show()

# Augmented Dickey-Fuller test: p < 0.05 rejects the unit-root null.
from statsmodels.tsa.stattools import adfuller

adf_result = adfuller(series_diff)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")
if adf_result[1] < 0.05:
    print("Series is stationary after differencing.")
else:
    print("Series is still non-stationary, further differencing")

# Second difference and repeat the same stationarity checks.
series_diff2 = series.diff().diff().dropna()
plot_acf(series_diff2, lags=20)
plt.show()

adf_result = adfuller(series_diff2)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")
if adf_result[1] < 0.05:
    print("Series is now stationary after second differencing.")
else:
    print("Series is still non-stationary, further transformations may be needed.")
# Side-by-side ACF (informs MA order q) and PACF (informs AR order p)
# of the twice-differenced series, computed once and reused for both panels.
stationary_series = series.diff().diff().dropna()
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sm.graphics.tsa.plot_acf(stationary_series, lags=20, ax=axes[0])
axes[0].set_title("Autocorrelation Function (ACF)")
sm.graphics.tsa.plot_pacf(stationary_series, lags=20, ax=axes[1])
axes[1].set_title("Partial Autocorrelation Function (PACF)")
plt.show()
# Chronological 80/20 split: the final 20% is held out for evaluation
# (no shuffling — order matters for time series).
split_at = int(len(series) * 0.8)
train = series.iloc[:split_at]
holdout = series.iloc[split_at:]

# Visualise where the split falls.
plt.figure(figsize=(10, 4))
plt.plot(train, label="Training Data")
plt.plot(holdout, label="Holdout Data", color="red")
plt.title("Train-Holdout Split")
plt.legend()
plt.show()
# --- Rolling-origin cross-validation: ARIMA for the mean, GARCH for volatility ---
tscv = TimeSeriesSplit(n_splits=5)  # 5 expanding-window folds
p, d, q = 1, 2, 2          # ARIMA orders chosen from the ACF/PACF analysis above
garch_p, garch_q = 1, 1    # GARCH(1,1) on the ARIMA residuals

arima_errors = []
garch_volatility_forecasts = []

for train_idx, test_idx in tscv.split(train):
    train_fold, test_fold = train.iloc[train_idx], train.iloc[test_idx]

    # Mean model: ARIMA fitted on the expanding training window,
    # forecast over the whole test fold.
    arima_fit = ARIMA(train_fold, order=(p, d, q)).fit()
    arima_forecast = arima_fit.forecast(steps=len(test_fold))

    # BUG FIX: the original called mean_absolute_error without importing it
    # (only mean_absolute_percentage_error was imported), raising NameError
    # on the first fold. The import is now provided at the top of the file.
    arima_errors.append(mean_absolute_error(test_fold, arima_forecast))

    # Volatility model: GARCH fitted on the ARIMA in-sample residuals.
    residuals = arima_fit.resid
    garch_fit = arch_model(residuals, vol='Garch', p=garch_p, q=garch_q).fit(disp="off")

    # Forecast conditional variance over the test horizon; sqrt -> volatility.
    # NOTE(review): start=len(residuals) points one past the last fitted
    # observation — confirm the installed arch version accepts this index
    # rather than requiring start=len(residuals) - 1.
    garch_forecast = garch_fit.forecast(start=len(residuals), horizon=len(test_fold))
    garch_volatility_forecasts.append(np.sqrt(garch_forecast.variance.iloc[-1]))

# Summarise out-of-sample accuracy across folds.
print(f"Average ARIMA MAE Across Folds: {np.mean(arima_errors)}")