AlgoX committed on
Commit
69194ec
·
0 Parent(s):

feature engineering file

Browse files
Files changed (2) hide show
  1. .gitattributes +4 -0
  2. data_prep/features.py +326 -0
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ checkpoints_classical/run_20251022_160934/SARIMAX_model.pkl filter=lfs diff=lfs merge=lfs -text
2
+ *.pt filter=lfs diff=lfs merge=lfs -text
3
+ *.pkl filter=lfs diff=lfs merge=lfs -text
4
+ *.png filter=lfs diff=lfs merge=lfs -text
data_prep/features.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
def _check_price_cols(df):
    """Ensure ``df`` exposes the OHLCV columns used by the indicator functions.

    Returns ``None`` when Open, High, Low, Close and Volume are all present.

    Raises:
        ValueError: if any of those columns is absent; the message lists the
            missing names in the order given above.
    """
    expected = ("Open", "High", "Low", "Close", "Volume")
    present = df.columns
    absent = [name for name in expected if name not in present]
    if not absent:
        return
    raise ValueError(f"Missing required columns: {absent}")
9
+
10
+
11
def _rma(series: pd.Series, n: int):
    """Wilder's moving average (RMA).

    The first ``n - 1`` outputs are NaN.  Output ``n - 1`` is seeded with the
    pandas mean of the first ``n`` inputs, and every later value follows
    ``rma[i] = rma[i - 1] * (1 - 1/n) + x[i] * (1/n)``.

    Args:
        series: input values; converted to float.
        n: smoothing period, must be at least 1.

    Returns:
        A float Series on the index of ``series``.  It is entirely NaN when
        ``series`` holds fewer than ``n`` values.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    # Without this guard n == 0 fails with ZeroDivisionError and a negative n
    # would seed the wrong position via negative indexing.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    values = series.astype(float)
    smoothed = np.full(len(values), np.nan)
    if len(values) < n:
        return pd.Series(smoothed, index=series.index)

    # Seed with the simple average of the first window.
    smoothed[n - 1] = values.iloc[:n].mean()

    raw = values.to_numpy()
    alpha = 1.0 / n
    # The recurrence depends on the previous output, so it stays a loop; it
    # runs over a plain array rather than going through Series accessors.
    for i in range(n, len(raw)):
        smoothed[i] = smoothed[i - 1] * (1 - alpha) + raw[i] * alpha

    return pd.Series(smoothed, index=series.index)
29
+
30
+
31
def _true_range(df):
    """Return the per-row true range of ``df``.

    For each row this is the largest of: High - Low, |High - previous Close|
    and |Low - previous Close|.  The first row has no previous close, so only
    the High - Low span contributes there (the row-wise max skips NaN).
    """
    prev_close = df["Close"].shift(1)
    candidates = pd.concat(
        [
            df["High"] - df["Low"],
            (df["High"] - prev_close).abs(),
            (df["Low"] - prev_close).abs(),
        ],
        axis=1,
    )
    return candidates.max(axis=1)
37
+
38
+
39
def add_daily_return(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``Daily_Return`` and ``Log_Return`` columns to a copy of ``df``.

    The five OHLCV columns are first coerced to numeric; values that cannot
    be parsed become NaN.

    Args:
        df: frame containing Open, High, Low, Close and Volume.

    Returns:
        A new DataFrame with the coerced price columns plus:
        ``Daily_Return`` (Close / previous Close - 1) and
        ``Log_Return`` (log of Close / previous Close).

    Raises:
        ValueError: if a required price column is missing.
    """
    _check_price_cols(df)

    out = df.copy()
    price_cols = ["Open", "High", "Low", "Close", "Volume"]
    out[price_cols] = out[price_cols].apply(pd.to_numeric, errors="coerce")

    close = out["Close"]
    # fill_method=None: do not forward-fill missing closes before computing
    # the change.  The implicit 'pad' default is deprecated in pandas and made
    # this column disagree with Log_Return (which never filled) around NaNs.
    out["Daily_Return"] = close.pct_change(fill_method=None)
    out["Log_Return"] = np.log(close / close.shift(1))
    return out
55
+
56
def add_trend_indicators(
    df: pd.DataFrame,
    sma_periods=(5, 10, 20, 50, 100, 200),
    ema_periods=(5, 10, 12, 20, 26, 50, 100, 200),
) -> pd.DataFrame:
    """Add SMA, EMA and MACD (12/26 with a 9-span signal) columns.

    Args:
        df: frame containing the OHLCV columns.
        sma_periods: window lengths for the ``SMA_{p}`` columns.  These use
            ``min_periods=1``, so early rows average whatever is available.
        ema_periods: spans for the ``EMA_{p}`` columns (``adjust=False``).

    Returns:
        A copy of ``df`` with the SMA/EMA columns plus ``MACD``,
        ``MACD_Signal`` and ``MACD_Hist``.

    Raises:
        ValueError: if a required price column is missing.
    """
    result = df.copy()
    _check_price_cols(result)
    close = result["Close"]

    def _ema(span):
        # Recursive (adjust=False) exponential average of the close.
        return close.ewm(span=span, adjust=False).mean()

    for window in sma_periods:
        result[f"SMA_{window}"] = close.rolling(window=window, min_periods=1).mean()

    for span in ema_periods:
        result[f"EMA_{span}"] = _ema(span)

    # MACD always uses 12/26 regardless of what ema_periods contains.
    macd_line = _ema(12) - _ema(26)
    signal_line = macd_line.ewm(span=9, adjust=False).mean()

    result["MACD"] = macd_line
    result["MACD_Signal"] = signal_line
    result["MACD_Hist"] = macd_line - signal_line
    return result
82
+
83
def add_momentum_indicators(
    df: pd.DataFrame, rsi_n=14, sto_k=14, sto_d=3, cci_n=20, roc_n=12
) -> pd.DataFrame:
    """Add common momentum indicators to a copy of ``df``.

    Columns added:
        - ``RSI_{rsi_n}`` (Wilder smoothing)
        - ``Sto_%K`` / ``Sto_%D`` (stochastic over ``sto_k``, %D over ``sto_d``)
        - ``Williams_%R_{sto_k}``
        - ``CCI_{cci_n}``
        - ``ROC_{roc_n}`` (fractional change, not multiplied by 100)
        - ``Momentum_{roc_n}`` (absolute change)
        - ``CMO_{rsi_n}`` (Chande Momentum Oscillator)
        - ``Ultimate_Osc`` (7/14/28)

    Args:
        df: frame containing the OHLCV columns.
        rsi_n: period for RSI and CMO.
        sto_k: look-back for stochastic %K and Williams %R.
        sto_d: smoothing window for stochastic %D.
        cci_n: period for CCI.
        roc_n: lag for ROC and Momentum.

    Returns:
        A new DataFrame with the columns above appended.

    Raises:
        ValueError: if a required price column is missing.
    """
    df = df.copy()
    _check_price_cols(df)

    high, low, close = df["High"], df["Low"], df["Close"]

    # RSI (Wilder smoothing)
    delta = close.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    df[f"RSI_{rsi_n}"] = 100 - (100 / (1 + (_rma(up, rsi_n) / _rma(down, rsi_n))))

    # Stochastic %K and %D
    low_min = low.rolling(window=sto_k).min()
    high_max = high.rolling(window=sto_k).max()
    df["Sto_%K"] = 100 * (close - low_min) / (high_max - low_min)
    df["Sto_%D"] = df["Sto_%K"].rolling(window=sto_d).mean()

    # Williams %R shares the stochastic look-back range.
    df[f"Williams_%R_{sto_k}"] = -100 * (high_max - close) / (high_max - low_min)

    # CCI: deviation of the typical price from its mean, scaled by the mean
    # absolute deviation within the same window.
    tp = (high + low + close) / 3.0
    sma_tp = tp.rolling(cci_n).mean()
    mad = tp.rolling(cci_n).apply(lambda x: np.mean(np.abs(x - np.mean(x))), raw=True)
    df[f"CCI_{cci_n}"] = (tp - sma_tp) / (0.015 * mad)

    # ROC and Momentum.  fill_method=None avoids the deprecated implicit
    # forward-fill of missing closes.
    df[f"ROC_{roc_n}"] = close.pct_change(periods=roc_n, fill_method=None)
    df[f"Momentum_{roc_n}"] = close - close.shift(roc_n)

    # CMO (Chande Momentum Oscillator)
    up_sum = delta.clip(lower=0).rolling(rsi_n).sum()
    down_sum = -delta.clip(upper=0).rolling(rsi_n).sum()
    df[f"CMO_{rsi_n}"] = 100 * (up_sum - down_sum) / (up_sum + down_sum)

    # Ultimate Oscillator (7,14,28 default)
    def _ultimate_osc(df_, s1=7, s2=14, s3=28):
        # Buying pressure is measured from the "true low": the lower of the
        # CURRENT low and the PREVIOUS close.  Only the close is shifted; the
        # earlier code shifted both columns and therefore used the previous
        # bar's low instead of the current one.
        true_low = pd.concat([df_["Low"], df_["Close"].shift(1)], axis=1).min(axis=1)
        bp = df_["Close"] - true_low
        tr = _true_range(df_)
        avg1 = bp.rolling(s1).sum() / tr.rolling(s1).sum()
        avg2 = bp.rolling(s2).sum() / tr.rolling(s2).sum()
        avg3 = bp.rolling(s3).sum() / tr.rolling(s3).sum()
        return 100 * ((4 * avg1) + (2 * avg2) + (1 * avg3)) / (4 + 2 + 1)

    df["Ultimate_Osc"] = _ultimate_osc(df)
    return df
143
+
144
+
145
def add_volume_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Add volume-based indicators to a copy of ``df``.

    Columns added:
        - ``OBV`` (On-Balance Volume)
        - ``CMF_20`` (Chaikin Money Flow)
        - ``ADL`` (Accumulation/Distribution Line)
        - ``VPT`` (Volume Price Trend)
        - ``VO_short_10``, ``VO_long_20``, ``Volume_Osc`` (volume oscillator)
        - ``MFI_14`` (Money Flow Index)
        - ``ForceIndex_1``, ``ForceIndex_EMA_13``

    Args:
        df: frame containing the OHLCV columns.

    Returns:
        A new DataFrame with the columns above appended.

    Raises:
        ValueError: if a required price column is missing.
    """
    df = df.copy()
    _check_price_cols(df)

    close, volume = df["Close"], df["Volume"]

    # OBV: add volume on up days, subtract on down days, ignore flat days.
    df["OBV"] = ((np.sign(close.diff()) * volume).fillna(0)).cumsum()

    # Money Flow Multiplier and Chaikin Money Flow
    mf_mult = ((close - df["Low"]) - (df["High"] - close)) / (
        df["High"] - df["Low"]
    )
    # A bar with High == Low divides by zero; treat it as zero money flow.
    mf_mult = mf_mult.replace([np.inf, -np.inf], 0).fillna(0)
    mf_volume = mf_mult * volume

    df["CMF_20"] = mf_volume.rolling(20).sum() / volume.rolling(20).sum()

    # ADL
    df["ADL"] = mf_volume.cumsum()

    # VPT.  fill_method=None avoids the deprecated implicit forward-fill of
    # missing closes; a row with no computable change contributes 0.
    df["VPT"] = (volume * close.pct_change(fill_method=None)).fillna(0).cumsum()

    # Volume Oscillator (short=10, long=20)
    df["VO_short_10"] = volume.rolling(10).mean()
    df["VO_long_20"] = volume.rolling(20).mean()
    df["Volume_Osc"] = (df["VO_short_10"] - df["VO_long_20"]) / df["VO_long_20"]

    # MFI (Money Flow Index)
    tp = (df["High"] + df["Low"] + close) / 3.0
    mf = tp * volume
    pos_mf = mf.where(tp > tp.shift(1), 0)
    neg_mf = mf.where(tp < tp.shift(1), 0)
    df["MFI_14"] = 100 - (
        100 / (1 + (pos_mf.rolling(14).sum() / neg_mf.rolling(14).sum()))
    )

    # Force Index
    df["ForceIndex_1"] = close.diff(1) * volume
    df["ForceIndex_EMA_13"] = df["ForceIndex_1"].ewm(span=13, adjust=False).mean()
    return df
197
+
198
+
199
def add_volatility_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Add volatility indicators to a copy of ``df``.

    Columns added:
        - ``TR`` and ``ATR_14`` (14-row simple rolling mean of the true range)
        - ``BB_MID_20``, ``BB_STD_20``, ``BB_UPPER_20``, ``BB_LOWER_20``,
          ``BB_Width`` (Bollinger Bands, 20 rows, 2 standard deviations)
        - ``Donchian_High_20``, ``Donchian_Low_20``, ``Donchian_Mid_20``

    Raises:
        ValueError: if a required price column is missing.
    """
    out = df.copy()
    _check_price_cols(out)

    # True range and its 14-row average.
    true_range = _true_range(out)
    out["TR"] = true_range
    out["ATR_14"] = true_range.rolling(14).mean()

    # Bollinger Bands (20,2)
    close_window = out["Close"].rolling(20)
    middle = close_window.mean()
    spread = close_window.std()
    upper = middle + 2 * spread
    lower = middle - 2 * spread
    out["BB_MID_20"] = middle
    out["BB_STD_20"] = spread
    out["BB_UPPER_20"] = upper
    out["BB_LOWER_20"] = lower
    out["BB_Width"] = (upper - lower) / middle

    # Donchian (20)
    channel_top = out["High"].rolling(20).max()
    channel_bottom = out["Low"].rolling(20).min()
    out["Donchian_High_20"] = channel_top
    out["Donchian_Low_20"] = channel_bottom
    out["Donchian_Mid_20"] = (channel_top + channel_bottom) / 2.0
    return out
228
+
229
+
230
+ # made by chatgpt because I have no idea how formula for these even works
231
def add_hybrid_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Add VWAP (cumulative), Heikin-Ashi candles, ADX, Aroon and Vortex.

    Columns added:
        - ``VWAP_cum``: cumulative volume-weighted typical price
        - ``HA_Open``, ``HA_Close``, ``HA_High``, ``HA_Low``
        - ``+DI_14``, ``-DI_14``, ``ADX_14``
        - ``Aroon_Up_25``, ``Aroon_Down_25``
        - ``Vortex_Pos_14``, ``Vortex_Neg_14``

    Args:
        df: frame containing the OHLCV columns.

    Returns:
        A new DataFrame with the columns above appended.

    Raises:
        ValueError: if a required price column is missing.
    """
    df = df.copy()
    _check_price_cols(df)

    # VWAP cumulative; zero cumulative volume becomes NaN to avoid 0/0.
    tp = (df["High"] + df["Low"] + df["Close"]) / 3.0
    cum_vp = (tp * df["Volume"]).cumsum()
    cum_vol = df["Volume"].cumsum().replace(0, np.nan)
    df["VWAP_cum"] = cum_vp / cum_vol

    # Heikin-Ashi.  HA open depends on the previous HA open, so it is built
    # row by row after seeding the first row with (Open + Close) / 2.
    ha_close = (df["Open"] + df["High"] + df["Low"] + df["Close"]) / 4.0
    ha_open = ha_close.copy()
    if len(ha_open) > 0:
        ha_open.iloc[0] = (df["Open"].iloc[0] + df["Close"].iloc[0]) / 2.0
    for i in range(1, len(ha_open)):
        ha_open.iat[i] = (ha_open.iat[i - 1] + ha_close.iat[i - 1]) / 2.0
    df["HA_Open"] = ha_open
    df["HA_Close"] = ha_close
    df["HA_High"] = df[["High", "HA_Open", "HA_Close"]].max(axis=1)
    df["HA_Low"] = df[["Low", "HA_Open", "HA_Close"]].min(axis=1)

    # ADX (using DI sums approach)
    def _adx(df_, n=14):
        up_move = df_["High"].diff()
        down_move = -df_["Low"].diff()
        # Only the dominant, positive move counts as directional movement.
        plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
        minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)
        # DM and TR must be aggregated the same way.  Previously the DM *sum*
        # was divided by the TR *mean*, inflating +DI/-DI by a factor of n.
        tr_sum = _true_range(df_).rolling(n).sum()
        plus_dm_sum = pd.Series(plus_dm, index=df_.index).rolling(window=n).sum()
        minus_dm_sum = pd.Series(minus_dm, index=df_.index).rolling(window=n).sum()
        plus_di = 100 * (plus_dm_sum / tr_sum)
        minus_di = 100 * (minus_dm_sum / tr_sum)
        dx = ((plus_di - minus_di).abs() / (plus_di + minus_di)) * 100
        adx = dx.rolling(n).mean()
        return plus_di, minus_di, adx

    df["+DI_14"], df["-DI_14"], df["ADX_14"] = _adx(df, 14)

    # Aroon (n=25)
    def _aroon(df_, n=25):
        # Aroon up/down in percentage, computed with rolling apply.
        # The window holds n rows, so "periods since" ranges over 0..n-1 and
        # the result lies between 100/n and 100.
        def single_aroon_up(arr):
            # arr is an array of highs in the window
            idx = np.argmax(arr)
            periods_since_high = (len(arr) - 1) - idx
            return ((n - periods_since_high) / n) * 100.0

        def single_aroon_down(arr):
            # arr is an array of lows in the window
            idx = np.argmin(arr)
            periods_since_low = (len(arr) - 1) - idx
            return ((n - periods_since_low) / n) * 100.0

        aroon_up = df_["High"].rolling(window=n).apply(single_aroon_up, raw=True)
        aroon_down = df_["Low"].rolling(window=n).apply(single_aroon_down, raw=True)
        return aroon_up, aroon_down

    # Deliberate best-effort: if Aroon cannot be computed the columns are
    # still created (as NaN) so the output schema stays stable.
    try:
        df["Aroon_Up_25"], df["Aroon_Down_25"] = _aroon(df, 25)
    except Exception:
        df["Aroon_Up_25"] = np.nan
        df["Aroon_Down_25"] = np.nan

    # Vortex Indicator
    def _vortex(df_, n=14):
        tr = _true_range(df_)
        trn = tr.rolling(n).sum()
        vmp = (df_["High"] - df_["Low"].shift(1)).abs().rolling(n).sum()
        vmm = (df_["Low"] - df_["High"].shift(1)).abs().rolling(n).sum()
        vip = vmp / trn
        vim = vmm / trn
        return vip, vim

    df["Vortex_Pos_14"], df["Vortex_Neg_14"] = _vortex(df, 14)

    return df
311
+
312
+
313
def add_all_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Run every indicator stage on a copy of ``df`` and scrub infinities.

    Stages run in this order: returns, trend, momentum, volume, volatility,
    hybrid.  Any +/-inf left in the result is replaced with NaN.
    """
    stages = (
        add_daily_return,
        add_trend_indicators,
        add_momentum_indicators,
        add_volume_indicators,
        add_volatility_indicators,
        add_hybrid_indicators,
    )

    enriched = df.copy()
    for stage in stages:
        enriched = stage(enriched)

    # cleanup infinities
    return enriched.replace([np.inf, -np.inf], np.nan)