File size: 9,624 Bytes
8cf4695
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287

import logging
from typing import List, Optional, Union

import pandas as pd
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import SlidingWindowSplitter, SingleWindowSplitter
from sktime.forecasting.model_selection import temporal_train_test_split
from statsmodels.tsa.tsatools import freq_to_period

from .utils import split_x_y, k_folds
from .models import AllModels


class Forecaster():
    def __init__(
        self
    ) -> None:
        logging.debug('Forecaster init')

    def fit(
        self,
        data: pd.DataFrame,
        exog: pd.DataFrame = None,
        n_predict: int = 1,
        window_length: int = None,
        target_col: str = None
    ) -> None:
        '''
        data: pandas DataFrame, required
            Data must contains datetime index and y column. Any additional column will be considered
            as exogenous data and been used for multi variate forecasting

        window_length: int, optional
            if not given, window_length will be inferred from the seasonality period, at most 20, at least 4
            recommend to use the amount of auto correlation(AR) as window length

        exog: pandas DataFrame, optionsal
            Exogenous data, must contains datetime columns, must NOT contain y column, the datetime index must
            covers full range of training and forecasting data length == (training + n_predict)
        '''

        print('[Forecaster - fit] ----- START -----')
        self.freq: str = pd.infer_freq(data.index)
        self.period = freq_to_period(self.freq)
        self.window_length = window_length
        self.target_col = target_col

        # Handle forecast window, this wil be used to build models
        # Forecast window will be always smaller than 20, or 1 seasonality period
        if window_length is None:
            self.window_length = min(20, self.period,  int(len(data)*0.2))
            # Forecast window will not be smaller than 4
            self.window_length = max(4, self.window_length)
            print(f'Inferred window_length: {self.window_length}')

        self.data = data.copy()

        if data.index.freq is None:
            self.data.index.freq = self.freq
        # self.data.index = pd.to_timedelta(self.data.index, unit="MS")

        # if there is no columns other than 'y', self.X and self.X_future will be None
        (
            self.fh,
            self.y,
            self.X,
            self.X_future) = split_x_y(
            self.data,
            window_length,
            n_predict,
            self.freq)

        if exog is not None:
            print('[Forecaster - fit] - exogenous data provided')
            try:
                self.exog = exog.loc[self.X.index]
                self.exog_future = exog.loc[self.X_future.index]
            except Exception:
                raise ValueError(
                    'Exogenous value not fit, exogenous data must contains a datetime index and covers the entire train and forecast time range.')

            print('[Forecaster - fit] - merge exogenous data with features')
            self.X = pd.concat([self.exog, self.X], axis=1)
            self.X_future = pd.concat(
                [self.exog_future, self.X_future], axis=1)

        # ---------------- #
        # Train Test Split #
        # ---------------- #
        print('[Forecaster - fit] Train test split')
        test_size = len(self.fh)

        print('[Forecaster - fit] Test size: ', test_size)
        if self.X is not None:
            (
                self.y_train,
                self.y_test,
                self.X_train,
                self.X_test
            ) = temporal_train_test_split(
                self.y,
                self.X,
                test_size=test_size)
        else:
            (
                self.y_train,
                self.y_test,
            ) = temporal_train_test_split(
                self.y,
                test_size=test_size)
        self.fh_test = ForecastingHorizon(self.y_test.index, is_relative=False)

        # ---------------- #
        # Cross Validation #
        # ---------------- #

        print(
            f'[Forecaster - fit] Single window splitter, with window_length {len(self.y) + test_size} and fh {test_size}')
        self.cv = SingleWindowSplitter(
            window_length=len(self.y) + test_size,
            fh=test_size
        )

        # ----- END [Train Test Split] ----- #

        # Originally wanted to create my own k-fold validation, but realised this doesn't work well with sktime API
        # self.k_folds = k_folds(
        #     data,
        #     self.period,
        #     window_length,
        #     n_predict,
        #     self.freq)

        print('[Forecaster - fit] ----- END -----')

    def forecast(
            self,
            models: Union[str, List[str]] = 'all',
            test: bool = False,
    ):
        '''
        Fit every requested model and collect its forecast. Relies on the attributes
        prepared by fit (y, X, fh, cv, window_length and the train / test split).

        models: str or list of str, optional
            passed to AllModels.init_models to select the models, defaults to 'all'

        test: bool, optional
            if True, models are fitted on y_train / X_train and forecast over fh_test with X_test;
            otherwise they are fitted on y / X and forecast over fh with X_future

        Returns a list with one dict per model, {'model': <name>, 'results': <forecast>}.
        The same list is stored in self.results.
        '''
        # ----------- #
        # Init Models #
        # ----------- #
        logging.debug('Init models')
        all_models = AllModels()

        self.models = all_models.init_models(models)

        if test:
            # fit only produces a feature split when features exist; without features
            # fall back to None rather than reading X_train / X_test
            has_features = self.X is not None
            y_fit = self.y_train
            horizon = self.fh_test
            X_fit = self.X_train if has_features else None
            X_ahead = self.X_test if has_features else None
        else:
            y_fit = self.y
            horizon = self.fh
            X_fit = self.X
            X_ahead = self.X_future

        results = []

        for m in self.models:

            model_name = m['name']

            m['model'].fit(
                y_fit,
                self.cv,
                self.window_length,
                X_fit)

            results.append({
                'model': model_name,
                'results': m['model'].forecast(horizon, X_ahead)
            })

        self.results = results

        return results

    def forecast__old(
        self,
        data: pd.DataFrame,
        n_predict: int,
        models: str or List[str] = 'all',
        window_length: int = None
    ) -> None:
        '''
        data: pandas DataFrame, required
            Data must contains datetime index and y column. Any additional column will be considered
            as exogenous data and been used for multi variate forecasting

        window_length: int, optional
            if not given, window_length will be inferred from the seasonality period, at most 20, at least 4
            recommend to use the amount of auto correlation(AR) as window length
        '''
        self.data = data
        logging.debug('Fitting data')

        datetime_index = data.index

        y = data[[self.target_col]].reset_index(drop=True)

        freq: str = pd.infer_freq(datetime_index)

        period = freq_to_period(freq)

        self.window_length = window_length

        # Handle forecast window, this wil be used to build models
        # Forecast window will be always smaller than 20, or 1 seasonality period
        if window_length is None:
            self.window_length = min(20, period,  int(len(data)*0.2))
            # Forecast window will not be smaller than 4
            self.window_length = max(4, self.window_length)

        # ----------------------- #
        # Handling exogenous data #
        # ----------------------- #
        exog, exog_columns, exog_train, exog_pred = None, None, None, None
        if len(data.columns) > 1:
            logging.debug('Exogenous found')
            exog = data.drop(columns=self.target_col).reset_index(drop=True)
            exog_columns = exog.columns
            exog_train = exog.copy()

            # Build lags of the exog data
            logging.debug('Building lags of exog data')
            for n in range(1, self.window_length+1):

                shifted_columns = {}
                for col in exog_columns:
                    shifted_columns[col] = f'{col}_-{n}'

                shifted = exog.shift(n).rename(columns=shifted_columns)

                exog_train = pd.concat(
                    [exog_train, shifted],
                    axis=1)

            logging.debug('Backward fill lags of exog data')
            exog_train = exog_train.bfill()

            # Split last n_predict rows from exog_train as exog_pred
            exog_pred = exog_train[-n_predict:]
            exog_train = exog_train[:-n_predict]

            # For both y and datetime index, need to cut off n_predict value to keep data consistent
            logging.debug('Cutting off y and datetime index be n_predict')
            y = y[n_predict:]
            datetime_index = datetime_index[n_predict:]

        # ----------- #
        # Init Models #
        # ----------- #
        logging.debug('Init models')
        all_models = AllModels()

        self.models = all_models.init_models(models)

        # Handle forecasting horizon
        fh = ForecastingHorizon(
            list(range(1, n_predict+1)), is_relative=True, freq=freq)
        # Cutoff is the last datetime value in the given data
        # meaning we'll forecast right after this point of time
        cutoff = datetime_index[-1]
        fh = fh.to_absolute(cutoff=cutoff)

        results = []

        # ----------------------- #
        # Fitting and Forecasting #
        # ----------------------- #

        for m in self.models:
            m['model'].fit(
                y,
                datetime_index,
                exog=exog_train,
                window_length=self.window_length)

            model_name = m['name']

            results.append({
                'model': model_name,
                'forecast': m['model'].forecast(fh, exog=exog_pred)
            })

        self.results = results

        # For testing
        self.exog_train = exog_train
        self.exog_pred = exog_pred