File size: 5,266 Bytes
e4f4b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# %%
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from arch import arch_model
from numpy.polynomial.polynomial import Polynomial
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.filters.hp_filter import hpfilter
from statsmodels.tsa.statespace.sarimax import SARIMAX

req_naics = 336111
csv_path = f"../data/processed_data_{req_naics}.csv"
df = pd.read_csv(csv_path)

print(df.head())

# Parse the 'year' column into pandas Timestamps (annual data, one row per year).
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Series under study: value of shipments ('vship'), the turnover proxy.
series = df['vship']

# Inspect the autocorrelation structure of the raw series.
plot_acf(series, lags=20)
plt.show()

# Magnitude of the DFT — a sharp peak would indicate a periodic component.
spectrum_magnitude = np.abs(np.fft.fft(series.dropna()))
plt.plot(spectrum_magnitude)
plt.title("Frequency Spectrum")
plt.show()


# Apply the Hodrick-Prescott filter to decompose the series into cycle + trend
# (hpfilter returns them in that order).
# FIX: lambda = 1600 is the convention for QUARTERLY data. This series is
# annual, for which the Ravn-Uhlig rule gives 1600 / 4**4 = 6.25.
cycle, trend = hpfilter(series, lamb=6.25)

# Overlay the extracted trend on the original series.
plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(trend, label="Extracted Trend (HP Filter)", color="red", linewidth=2)
plt.legend()
plt.title("Trend Extraction Using HP Filter")
plt.show()

# Deterministic polynomial trend over a 0..n-1 time index.
X = np.arange(len(series))  # numeric time index (years mapped to 0, 1, 2, ...)
y = series.values

# FIX: the original comment/label said "2nd-degree"/"Quadratic" but deg=5.
# Keep the actual degree-5 fit and correct the documentation to match.
# (Polynomial.fit scales the domain internally; p(X) still evaluates on the
# original index.)
p = Polynomial.fit(X, y, deg=5)

# Overlay the fitted trend on the original series.
plt.figure(figsize=(10, 5))
plt.plot(series, label="Original Series", color="blue", alpha=0.6)
plt.plot(series.index, p(X), label="Degree-5 Polynomial Trend", color="red", linewidth=2)
plt.legend()
plt.title("Polynomial Trend Fitting")
plt.show()

# First-order differencing to remove the trend component.
series_diff = series.diff().dropna()

# Recheck the ACF after differencing.
# (Redundant local re-import of plot_acf removed — it is imported at the top.)
plot_acf(series_diff, lags=20)
plt.show()

from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test: H0 = unit root (non-stationary).
adf_result = adfuller(series_diff)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")

if adf_result[1] < 0.05:
    print("Series is stationary after differencing.")
else:
    # FIX: original message was a truncated sentence ("further differencing").
    print("Series is still non-stationary, further differencing may be needed.")

# Apply second-order differencing in case one difference was not enough.
series_diff2 = series.diff().diff().dropna()

# Recheck the ACF after second differencing.
# (Redundant re-imports of plot_acf and adfuller removed — both are already
# imported earlier in the file.)
plot_acf(series_diff2, lags=20)
plt.show()

# ADF test on the twice-differenced series: H0 = unit root.
adf_result = adfuller(series_diff2)
print(f"ADF Statistic: {adf_result[0]}")
print(f"P-value: {adf_result[1]}")

if adf_result[1] < 0.05:
    print("Series is now stationary after second differencing.")
else:
    print("Series is still non-stationary, further transformations may be needed.")


# ACF/PACF of the twice-differenced series, used to pick ARIMA(p, d=2, q):
# the ACF cutoff suggests q, the PACF cutoff suggests p.
diff2 = series.diff().diff().dropna()  # hoisted: was computed twice below

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# ACF (for q selection)
sm.graphics.tsa.plot_acf(diff2, lags=20, ax=axes[0])
axes[0].set_title("Autocorrelation Function (ACF)")

# PACF (for p selection)
sm.graphics.tsa.plot_pacf(diff2, lags=20, ax=axes[1])
axes[1].set_title("Partial Autocorrelation Function (PACF)")

plt.show()

# Chronological 80/20 split — never shuffle a time series.
train_size = int(len(series) * 0.8)
train = series.iloc[:train_size]
holdout = series.iloc[train_size:]

# Visualize where the split falls.
plt.figure(figsize=(10, 4))
plt.plot(train, label="Training Data")
plt.plot(holdout, label="Holdout Data", color="red")
plt.title("Train-Holdout Split")
plt.legend()
plt.show()


# Rolling-origin cross-validation: each fold fits on an expanding prefix of
# `train` and evaluates on the window that follows it.
tscv = TimeSeriesSplit(n_splits=5)  # 5 folds

p, d, q = 1, 2, 2  # ARIMA orders chosen from the ACF/PACF analysis above
garch_p, garch_q = 1, 1  # GARCH(1, 1) on the ARIMA residuals

arima_errors = []
garch_volatility_forecasts = []

for train_idx, test_idx in tscv.split(train):
    train_fold, test_fold = train.iloc[train_idx], train.iloc[test_idx]

    # Fit ARIMA on the expanding training window.
    arima_fit = ARIMA(train_fold, order=(p, d, q)).fit()

    # Point forecast over the test window.
    arima_forecast = arima_fit.forecast(steps=len(test_fold))

    # BUG FIX: the original called mean_absolute_error without importing it
    # (only mean_absolute_percentage_error was imported), raising NameError
    # on the first fold. The import is now added at the top of the file.
    error = mean_absolute_error(test_fold, arima_forecast)
    arima_errors.append(error)

    # Model the conditional volatility of the ARIMA residuals with GARCH.
    residuals = arima_fit.resid
    garch_fit = arch_model(residuals, vol='Garch', p=garch_p, q=garch_q).fit(disp="off")

    # Forecast residual volatility over the test horizon: take the last row
    # of the variance frame and convert variance -> standard deviation.
    # NOTE(review): start=len(residuals) points one past the last in-sample
    # observation — confirm this is valid for the installed arch version.
    garch_forecast = garch_fit.forecast(start=len(residuals), horizon=len(test_fold))
    volatility_forecast = np.sqrt(garch_forecast.variance.iloc[-1])

    garch_volatility_forecasts.append(volatility_forecast)

# Average out-of-sample MAE across folds.
print(f"Average ARIMA MAE Across Folds: {np.mean(arima_errors)}")