Arkm20 committed on
Commit
86b656d
·
verified ·
1 Parent(s): 00db226

Create features.py

Browse files
Files changed (1) hide show
  1. src/features.py +286 -0
src/features.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ features.py — Sniper v7.1 feature engineering & label construction.
3
+ This is the single source of truth used by both the backtester and evaluator.
4
+ Ported directly from sniper_v7_1.py training code.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Feature engineering (mirrors sniper_v7_1.py build_features exactly)
13
+ # ---------------------------------------------------------------------------
14
+
15
def build_features(df: pd.DataFrame, vix_data=None, sp500_data=None) -> pd.DataFrame:
    """
    Derive the technical feature matrix from an OHLCV frame.

    Args:
        df: Frame with columns Open, High, Low, Close, Volume.
        vix_data: Optional series; when given it is forward-fill reindexed
            onto ``df.index`` and adds ``rv_iv_ratio`` plus five ``vix*`` columns.
        sp500_data: Optional series; when given it is forward-fill reindexed
            onto ``df.index`` and adds four ``sp500_*`` columns.

    Returns:
        DataFrame indexed like ``df`` with 81 columns (91 when both optional
        series are supplied). Every column is shifted down by one row so the
        value on row t only uses data up to row t-1 (no lookahead).
    """
    close = df["Close"]
    high = df["High"]
    low = df["Low"]
    open_ = df["Open"]
    volume = df["Volume"]
    prev_close = close.shift(1)
    ret_1d = close.pct_change()

    out = pd.DataFrame(index=df.index)

    def _streak(flag):
        # Running count of consecutive 1s; resets whenever the flag flips.
        run_id = (flag != flag.shift()).cumsum()
        return flag.groupby(run_id).cumsum()

    def _rsi(change, window):
        # Simple-moving-average RSI; a zero average loss yields NaN.
        avg_gain = change.where(change > 0, 0.0).rolling(window).mean()
        avg_loss = (-change.where(change < 0, 0.0)).rolling(window).mean()
        strength = avg_gain / avg_loss.replace(0, np.nan)
        return 100 - (100 / (1 + strength))

    def _mean_abs_dev(window):
        return np.mean(np.abs(window - window.mean()))

    def _autocorr_lag5(window):
        return window.autocorr(lag=5) if len(window) > 5 else 0.0

    # --- Exhaustion / Mean-reversion signals ---
    out["consec_down_days"] = _streak((close < prev_close).astype(int))
    out["consec_up_days"] = _streak((close > prev_close).astype(int))

    for win in (5, 10, 20, 50):
        out[f"dist_from_{win}d_low"] = (close - low.rolling(win).min()) / close
        out[f"dist_from_{win}d_high"] = (high.rolling(win).max() - close) / close

    for win in (5, 20):
        out[f"vol_ratio_{win}d"] = volume / volume.rolling(win).mean()

    for win in (3, 5, 10):
        out[f"drawdown_{win}d"] = (close / close.rolling(win).max()) - 1

    out["sell_climax_5d"] = ret_1d.rolling(5).min() * out["vol_ratio_5d"]

    # --- Oscillators ---
    change = close.diff()
    out["rsi_14"] = _rsi(change, 14)
    out["rsi_7"] = _rsi(change, 7)

    floor14 = low.rolling(14).min()
    ceil14 = high.rolling(14).max()
    span14 = (ceil14 - floor14).replace(0, np.nan)
    out["stoch_k"] = 100 * (close - floor14) / span14
    out["stoch_d"] = out["stoch_k"].rolling(3).mean()
    out["williams_r"] = -100 * (ceil14 - close) / span14

    typical = (high + low + close) / 3
    typical_prev = typical.shift(1)
    typical_sma = typical.rolling(20).mean()
    typical_mad = typical.rolling(20).apply(_mean_abs_dev, raw=True)
    out["cci_20"] = (typical - typical_sma) / (0.015 * typical_mad).replace(0, np.nan)

    macd = close.ewm(span=12).mean() - close.ewm(span=26).mean()
    out["macd_hist"] = macd - macd.ewm(span=9).mean()
    out["macd_hist_norm"] = out["macd_hist"] / close

    money_flow = typical * volume
    inflow = money_flow.where(typical > typical_prev, 0).rolling(14).sum()
    outflow = money_flow.where(typical <= typical_prev, 0).rolling(14).sum()
    out["mfi_14"] = 100 - (100 / (1 + inflow / outflow.replace(0, np.nan)))

    rsi_rebound = out["rsi_14"] - out["rsi_14"].rolling(5).min()
    price_rebound = (close - close.rolling(5).min()) / close * 100
    out["rsi_div_5d"] = rsi_rebound - price_rebound

    # --- Volume / OBV ---
    obv = (np.sign(ret_1d) * volume).cumsum()
    for win in (10, 20):
        out[f"obv_slope_{win}d"] = obv.pct_change(win)

    # --- Volatility ---
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()], axis=1
    ).max(axis=1)
    out["atr_14"] = true_range.rolling(14).mean()
    out["atr_ratio"] = out["atr_14"] / close

    for win in (5, 10, 20, 60):
        out[f"hvol_{win}d"] = ret_1d.rolling(win).std() * np.sqrt(252)

    out["vol_contraction"] = out["hvol_5d"] / out["hvol_20d"].replace(0, np.nan)
    out["vol_contraction_long"] = out["hvol_10d"] / out["hvol_60d"].replace(0, np.nan)

    sma20 = close.rolling(20).mean()
    std20 = close.rolling(20).std()
    bb_upper = sma20 + 2 * std20
    bb_lower = sma20 - 2 * std20
    out["bb_width"] = (bb_upper - bb_lower) / sma20
    out["bb_pctb"] = (close - bb_lower) / (bb_upper - bb_lower).replace(0, np.nan)

    kc_mid = close.ewm(span=20).mean()
    kc_upper = kc_mid + 1.5 * out["atr_14"]
    kc_lower = kc_mid - 1.5 * out["atr_14"]
    out["keltner_pos"] = (close - kc_lower) / (kc_upper - kc_lower).replace(0, np.nan)
    # Squeeze: Bollinger bands sit entirely inside the Keltner channel.
    out["squeeze"] = ((bb_lower > kc_lower) & (bb_upper < kc_upper)).astype(int)

    for win in (5, 10, 20):
        out[f"range_pct_{win}d"] = (high.rolling(win).max() - low.rolling(win).min()) / close

    # The aligned VIX series is reused by the market-context section below.
    vix_aligned = None
    if vix_data is not None:
        vix_aligned = vix_data.reindex(df.index, method="ffill")
        out["rv_iv_ratio"] = (out["hvol_20d"] * 100) / vix_aligned.replace(0, np.nan)

    # --- Returns ---
    for win in (1, 2, 3, 5, 10, 20, 60):
        out[f"ret_{win}d"] = close.pct_change(win)

    # --- Trend / Price structure ---
    sma_by_window = {win: close.rolling(win).mean() for win in (5, 10, 20, 50, 200)}
    for win, sma in sma_by_window.items():
        out[f"dist_sma_{win}"] = (close - sma) / sma

    for win in (8, 21, 55):
        ema = close.ewm(span=win).mean()
        out[f"dist_ema_{win}"] = (close - ema) / ema

    sma50 = sma_by_window[50]
    out["sma50_slope"] = sma50.pct_change(5)
    out["sma20_slope"] = sma20.pct_change(5)
    out["above_sma200"] = (close > sma_by_window[200]).astype(int)
    out["above_sma50"] = (close > sma50).astype(int)
    out["gap"] = (open_ - prev_close) / prev_close

    candle_top = pd.concat([close, open_], axis=1).max(axis=1)
    candle_bottom = pd.concat([close, open_], axis=1).min(axis=1)
    bar_range = (high - low).replace(0, np.nan)
    out["body_ratio"] = (close - open_).abs() / bar_range
    out["upper_wick_ratio"] = (high - candle_top) / bar_range
    out["lower_wick_ratio"] = (candle_bottom - low) / bar_range

    # --- Lagged signals ---
    # Each lag is shifted by (lag - 1) here; the global one-row shift at the
    # end of the function supplies the remaining row.
    for lag in (1, 2, 3, 5, 10, 21):
        out[f"ret_1d_lag{lag}"] = ret_1d.shift(max(0, lag - 1))

    for lag in (1, 5, 10):
        out[f"vol_ratio_lag{lag}"] = out["vol_ratio_20d"].shift(max(0, lag - 1))

    for lag in (1, 3, 5):
        out[f"rsi_lag{lag}"] = out["rsi_14"].shift(max(0, lag - 1))

    out["mean_rev_5d"] = out["ret_5d"] * (out["rsi_14"] < 30).astype(float)
    out["autocorr_5d"] = ret_1d.rolling(20).apply(_autocorr_lag5, raw=False)

    # --- External / Market context ---
    if vix_aligned is not None:
        out["vix"] = vix_aligned
        out["vix_ma10"] = vix_aligned.rolling(10).mean()
        out["vix_pctile"] = vix_aligned.rolling(252).rank(pct=True)
        out["vix_change_5d"] = vix_aligned.pct_change(5)
        out["vix_term_structure"] = vix_aligned / vix_aligned.rolling(20).mean()

    if sp500_data is not None:
        sp_aligned = sp500_data.reindex(df.index, method="ffill")
        sp_daily = sp_aligned.pct_change()
        out["sp500_ret_5d"] = sp_aligned.pct_change(5)
        out["sp500_ret_20d"] = sp_aligned.pct_change(20)
        out["sp500_above_sma200"] = (sp_aligned > sp_aligned.rolling(200).mean()).astype(int)
        out["sp500_hvol_20d"] = sp_daily.rolling(20).std() * np.sqrt(252)

    # Falls back to zeros when the S&P series was not supplied.
    zero_fill = pd.Series(0, index=df.index)
    out["market_breadth_proxy"] = (
        out.get("sp500_ret_5d", zero_fill) - out.get("ret_5d", zero_fill)
    )

    # Shift 1 to prevent lookahead leakage
    return out.shift(1)
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # Label construction (mirrors sniper_v7_1.py construct_labels exactly)
195
+ # ---------------------------------------------------------------------------
196
+
197
def construct_labels(
    df: pd.DataFrame,
    pt_multiplier: float = 3.0,
    sl_multiplier: float = 0.5,
    atr_period: int = 20,
    horizon: int = 15,
    use_time_weight: bool = True,
    time_weight_decay: float = 0.80,
) -> tuple:
    """
    Dual-barrier label construction.

    For each row the entry is that row's Close. The profit target sits
    ``pt_multiplier`` ATRs above it and the stop loss ``sl_multiplier`` ATRs
    below it, where ATR is the simple rolling mean of true range over
    ``atr_period`` rows. The next ``horizon`` rows are scanned in order; the
    stop (Low <= lower barrier) is tested before the target (High >= upper
    barrier), so a bar touching both counts as a loss.

    Args:
        df: Frame with columns Close, High, Low.
        pt_multiplier: Profit-target distance in ATR units.
        sl_multiplier: Stop-loss distance in ATR units.
        atr_period: Rolling window length for the ATR.
        horizon: Number of forward rows to scan; must be >= 0.
        use_time_weight: When True, winning rows get a weight of
            ``time_weight_decay ** (j - 1)`` where j is the hit offset.
        time_weight_decay: Per-row decay factor for winning-row weights.

    Returns:
        (labels, time_weights), both Series indexed like ``df``.
        label = 1 if PT is hit before SL within ``horizon`` rows, else 0.
        The last ``horizon`` rows are masked as -1 (none when horizon is 0).
        Weights are 1.0 for every row that is not a time-weighted win.

    Raises:
        ValueError: If ``horizon`` is negative.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be non-negative, got {horizon}")

    close = df["Close"].values
    high = df["High"].values
    low = df["Low"].values
    n = len(close)

    prev_close = close[:-1]
    true_range = np.maximum(
        high[1:] - low[1:],
        np.maximum(np.abs(high[1:] - prev_close), np.abs(low[1:] - prev_close)),
    )
    # Row 0 has no previous close, so its true range is left as NaN.
    atr = pd.Series(np.concatenate([[np.nan], true_range])).rolling(atr_period).mean().values

    labels = np.zeros(n, dtype=int)
    time_weights = np.ones(n, dtype=float)

    # i < n - horizon guarantees i + j <= n - 1 for every j in 1..horizon.
    for i in range(n - horizon):
        band = atr[i]
        if np.isnan(band) or band == 0:
            continue
        upper_barrier = close[i] + pt_multiplier * band
        lower_barrier = close[i] - sl_multiplier * band

        for j in range(1, horizon + 1):
            if low[i + j] <= lower_barrier:
                break
            if high[i + j] >= upper_barrier:
                labels[i] = 1
                if use_time_weight:
                    time_weights[i] = time_weight_decay ** (j - 1)
                break

    # Guarded because labels[-0:] is labels[0:] and would mask every row.
    if horizon > 0:
        labels[-horizon:] = -1
    return pd.Series(labels, index=df.index), pd.Series(time_weights, index=df.index)
246
+
247
+
248
+ # ---------------------------------------------------------------------------
249
+ # ATR helper (for live stop/target calculation in the backtester)
250
+ # ---------------------------------------------------------------------------
251
+
252
def compute_atr(df: pd.DataFrame, period: int = 14) -> pd.Series:
    """
    Return the simple-moving-average ATR of ``df`` over ``period`` rows.

    True range per row is the largest of High - Low, |High - previous Close|
    and |Low - previous Close|. The first row has no previous Close, so its
    true range is High - Low. The first ``period - 1`` results are NaN.
    """
    prev_close = df["Close"].shift(1)
    candidates = pd.DataFrame(
        {
            "bar": df["High"] - df["Low"],
            "gap_up": (df["High"] - prev_close).abs(),
            "gap_down": (df["Low"] - prev_close).abs(),
        }
    )
    true_range = candidates.max(axis=1)
    return true_range.rolling(period).mean()
260
+
261
+
262
+ # ---------------------------------------------------------------------------
263
+ # Confluence scoring (bonus filter, same as trainer)
264
+ # ---------------------------------------------------------------------------
265
+
266
def compute_confluence(X: pd.DataFrame) -> pd.Series:
    """
    Count, per row of ``X``, how many of ten confluence conditions hold.

    Each rule compares one feature column with a fixed threshold. A column
    missing from ``X`` is replaced by a constant default that does not
    satisfy its rule, and a NaN value never satisfies a rule. The result is
    a float Series indexed like ``X`` with values between 0 and 10.
    """
    # (label, column, default when missing, Series comparison method, threshold)
    rules = (
        ("RSI oversold", "rsi_14", 50, "lt", 35),
        ("Stoch oversold", "stoch_k", 50, "lt", 25),
        ("MFI oversold", "mfi_14", 50, "lt", 30),
        ("Below BB lower", "bb_pctb", 0.5, "lt", 0.1),
        ("Near SMA support", "dist_sma_20", 0, "lt", -0.03),
        ("Volume spike", "vol_ratio_20d", 1, "gt", 1.5),
        ("VIX elevated", "vix_pctile", 0.5, "gt", 0.7),
        ("Consec down", "consec_down_days", 0, "ge", 3),
        ("Recent drawdown", "drawdown_5d", 0, "lt", -0.05),
        ("Trend intact", "sma50_slope", 0, "gt", 0),
    )

    total = pd.Series(np.zeros(len(X)), index=X.index)
    for _label, column, fallback, comparison, threshold in rules:
        if column in X.columns:
            values = X[column]
        else:
            values = pd.Series(fallback, index=X.index)
        satisfied = getattr(values, comparison)(threshold)
        total = total + satisfied.astype(float).fillna(0)
    return total