Spaces:
Sleeping
Sleeping
Create features.py
Browse files- src/features.py +286 -0
src/features.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
features.py — Sniper v7.1 feature engineering & label construction.
|
| 3 |
+
This is the single source of truth used by both the backtester and evaluator.
|
| 4 |
+
Ported directly from sniper_v7_1.py training code.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
# Feature engineering (mirrors sniper_v7_1.py build_features exactly)
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
def build_features(df: pd.DataFrame, vix_data=None, sp500_data=None) -> pd.DataFrame:
    """
    Derive the technical feature matrix from daily OHLCV bars.

    Parameters
    ----------
    df
        Frame with ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns.
    vix_data
        Optional series that is forward-fill reindexed onto ``df.index``
        (presumably VIX closes -- confirm with the caller). When given, it adds
        ``rv_iv_ratio`` and the ``vix*`` columns.
    sp500_data
        Optional series that is forward-fill reindexed onto ``df.index``
        (presumably S&P 500 closes -- confirm with the caller). When given, it
        adds the ``sp500_*`` columns.

    Returns
    -------
    pd.DataFrame
        Indexed like ``df``; 81 columns without external data, 91 with both
        series. Every column is shifted down one row before returning, so row
        ``t`` only holds values computed from bars up to ``t - 1`` and the
        first row is entirely NaN.

    Column creation order is kept stable on purpose: downstream consumers may
    rely on positional feature order.
    """
    close = df["Close"]
    high = df["High"]
    low = df["Low"]
    open_ = df["Open"]
    volume = df["Volume"]
    prev_close = close.shift(1)
    ret_1d = close.pct_change()
    out = pd.DataFrame(index=df.index)

    def _streak(flags):
        # Running length of the current run of 1s; 0 wherever the flag is 0.
        run_id = (flags != flags.shift()).cumsum()
        return flags.groupby(run_id).cumsum()

    # --- Exhaustion / mean-reversion signals ---
    out["consec_down_days"] = _streak((close < prev_close).astype(int))
    out["consec_up_days"] = _streak((close > prev_close).astype(int))

    for window in (5, 10, 20, 50):
        out[f"dist_from_{window}d_low"] = (close - low.rolling(window).min()) / close
        out[f"dist_from_{window}d_high"] = (high.rolling(window).max() - close) / close

    for window in (5, 20):
        out[f"vol_ratio_{window}d"] = volume / volume.rolling(window).mean()

    for window in (3, 5, 10):
        out[f"drawdown_{window}d"] = (close / close.rolling(window).max()) - 1

    out["sell_climax_5d"] = ret_1d.rolling(5).min() * out["vol_ratio_5d"]

    # --- Oscillators ---
    delta = close.diff()

    def _rsi(window):
        # RSI from simple rolling means of gains and losses; a zero average
        # loss becomes NaN so the ratio never divides by zero.
        avg_gain = delta.where(delta > 0, 0.0).rolling(window).mean()
        avg_loss = (-delta.where(delta < 0, 0.0)).rolling(window).mean()
        rel_strength = avg_gain / avg_loss.replace(0, np.nan)
        return 100 - (100 / (1 + rel_strength))

    rsi_14 = _rsi(14)
    out["rsi_14"] = rsi_14
    out["rsi_7"] = _rsi(7)

    lowest_14 = low.rolling(14).min()
    highest_14 = high.rolling(14).max()
    span_14 = (highest_14 - lowest_14).replace(0, np.nan)
    out["stoch_k"] = 100 * (close - lowest_14) / span_14
    out["stoch_d"] = out["stoch_k"].rolling(3).mean()
    out["williams_r"] = -100 * (highest_14 - close) / span_14

    typical = (high + low + close) / 3
    typical_sma = typical.rolling(20).mean()
    mean_abs_dev = typical.rolling(20).apply(
        lambda x: np.mean(np.abs(x - x.mean())), raw=True
    )
    out["cci_20"] = (typical - typical_sma) / (0.015 * mean_abs_dev).replace(0, np.nan)

    macd = close.ewm(span=12).mean() - close.ewm(span=26).mean()
    macd_signal = macd.ewm(span=9).mean()
    out["macd_hist"] = macd - macd_signal
    out["macd_hist_norm"] = out["macd_hist"] / close

    money_flow = typical * volume
    prev_typical = typical.shift(1)
    inflow = money_flow.where(typical > prev_typical, 0).rolling(14).sum()
    outflow = money_flow.where(typical <= prev_typical, 0).rolling(14).sum()
    out["mfi_14"] = 100 - (100 / (1 + inflow / outflow.replace(0, np.nan)))

    rsi_rebound = rsi_14 - rsi_14.rolling(5).min()
    price_rebound = (close - close.rolling(5).min()) / close * 100
    out["rsi_div_5d"] = rsi_rebound - price_rebound

    # --- Volume / OBV ---
    obv = (np.sign(ret_1d) * volume).cumsum()
    for window in (10, 20):
        out[f"obv_slope_{window}d"] = obv.pct_change(window)

    # --- Volatility ---
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()], axis=1
    ).max(axis=1)
    atr_14 = true_range.rolling(14).mean()
    out["atr_14"] = atr_14
    out["atr_ratio"] = atr_14 / close

    annualise = np.sqrt(252)
    for window in (5, 10, 20, 60):
        out[f"hvol_{window}d"] = ret_1d.rolling(window).std() * annualise

    out["vol_contraction"] = out["hvol_5d"] / out["hvol_20d"].replace(0, np.nan)
    out["vol_contraction_long"] = out["hvol_10d"] / out["hvol_60d"].replace(0, np.nan)

    # Simple moving averages are computed once and reused below.
    sma = {window: close.rolling(window).mean() for window in (5, 10, 20, 50, 200)}

    std_20 = close.rolling(20).std()
    bb_upper = sma[20] + 2 * std_20
    bb_lower = sma[20] - 2 * std_20
    bb_span = bb_upper - bb_lower
    out["bb_width"] = bb_span / sma[20]
    out["bb_pctb"] = (close - bb_lower) / bb_span.replace(0, np.nan)

    kc_mid = close.ewm(span=20).mean()
    kc_upper = kc_mid + 1.5 * atr_14
    kc_lower = kc_mid - 1.5 * atr_14
    out["keltner_pos"] = (close - kc_lower) / (kc_upper - kc_lower).replace(0, np.nan)
    # 1 when the Bollinger band sits entirely inside the Keltner channel.
    out["squeeze"] = ((bb_lower > kc_lower) & (bb_upper < kc_upper)).astype(int)

    for window in (5, 10, 20):
        out[f"range_pct_{window}d"] = (
            high.rolling(window).max() - low.rolling(window).min()
        ) / close

    vix_aligned = None
    if vix_data is not None:
        vix_aligned = vix_data.reindex(df.index, method="ffill")
        out["rv_iv_ratio"] = (out["hvol_20d"] * 100) / vix_aligned.replace(0, np.nan)

    # --- Returns ---
    for window in (1, 2, 3, 5, 10, 20, 60):
        out[f"ret_{window}d"] = close.pct_change(window)

    # --- Trend / price structure ---
    for window, average in sma.items():
        out[f"dist_sma_{window}"] = (close - average) / average

    for span in (8, 21, 55):
        ema = close.ewm(span=span).mean()
        out[f"dist_ema_{span}"] = (close - ema) / ema

    out["sma50_slope"] = sma[50].pct_change(5)
    out["sma20_slope"] = sma[20].pct_change(5)
    out["above_sma200"] = (close > sma[200]).astype(int)
    out["above_sma50"] = (close > sma[50]).astype(int)
    out["gap"] = (open_ - prev_close) / prev_close

    candle = pd.concat([close, open_], axis=1)
    bar_range = (high - low).replace(0, np.nan)
    out["body_ratio"] = (close - open_).abs() / bar_range
    out["upper_wick_ratio"] = (high - candle.max(axis=1)) / bar_range
    out["lower_wick_ratio"] = (candle.min(axis=1) - low) / bar_range

    # --- Lagged signals ---
    # Each lag is shifted by (lag - 1) here; the global one-row shift at the
    # end supplies the remaining step.
    for lag in (1, 2, 3, 5, 10, 21):
        out[f"ret_1d_lag{lag}"] = ret_1d.shift(max(0, lag - 1))

    for lag in (1, 5, 10):
        out[f"vol_ratio_lag{lag}"] = out["vol_ratio_20d"].shift(max(0, lag - 1))

    for lag in (1, 3, 5):
        out[f"rsi_lag{lag}"] = rsi_14.shift(max(0, lag - 1))

    out["mean_rev_5d"] = out["ret_5d"] * (rsi_14 < 30).astype(float)
    out["autocorr_5d"] = ret_1d.rolling(20).apply(
        lambda x: x.autocorr(lag=5) if len(x) > 5 else 0.0, raw=False
    )

    # --- External / market context ---
    if vix_aligned is not None:
        out["vix"] = vix_aligned
        out["vix_ma10"] = vix_aligned.rolling(10).mean()
        out["vix_pctile"] = vix_aligned.rolling(252).rank(pct=True)
        out["vix_change_5d"] = vix_aligned.pct_change(5)
        out["vix_term_structure"] = vix_aligned / vix_aligned.rolling(20).mean()

    if sp500_data is not None:
        sp_aligned = sp500_data.reindex(df.index, method="ffill")
        sp_ret_1d = sp_aligned.pct_change()
        out["sp500_ret_5d"] = sp_aligned.pct_change(5)
        out["sp500_ret_20d"] = sp_aligned.pct_change(20)
        out["sp500_above_sma200"] = (
            sp_aligned > sp_aligned.rolling(200).mean()
        ).astype(int)
        out["sp500_hvol_20d"] = sp_ret_1d.rolling(20).std() * annualise

    # Falls back to a zero series when the S&P 500 column was not created.
    zeros = pd.Series(0, index=df.index)
    out["market_breadth_proxy"] = out.get("sp500_ret_5d", zeros) - out.get("ret_5d", zeros)

    # One-row shift so no feature at row t uses information from bar t itself.
    return out.shift(1)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
# Label construction (mirrors sniper_v7_1.py construct_labels exactly)
|
| 195 |
+
# ---------------------------------------------------------------------------
|
| 196 |
+
|
| 197 |
+
def construct_labels(
    df: pd.DataFrame,
    pt_multiplier: float = 3.0,
    sl_multiplier: float = 0.5,
    atr_period: int = 20,
    horizon: int = 15,
    use_time_weight: bool = True,
    time_weight_decay: float = 0.80,
) -> tuple:
    """
    Dual-barrier label construction.

    For each bar ``i`` the entry price is ``Close[i]``. A profit target is
    placed ``pt_multiplier * ATR[i]`` above it and a stop loss
    ``sl_multiplier * ATR[i]`` below it, where ATR is the simple rolling mean
    of true range over ``atr_period`` bars. The next ``horizon`` bars are then
    scanned in order.

    Parameters
    ----------
    df
        Frame with ``Close``, ``High`` and ``Low`` columns.
    pt_multiplier, sl_multiplier
        Barrier distances expressed in ATR units.
    atr_period
        Rolling window length for the ATR.
    horizon
        Number of forward bars to scan; must be non-negative.
    use_time_weight
        When true, a positive label gets weight
        ``time_weight_decay ** (bars_to_hit - 1)``; otherwise weights stay 1.0.
    time_weight_decay
        Per-bar decay factor for the weight of positive labels.

    Returns
    -------
    tuple
        ``(labels, time_weights)``, both Series indexed like ``df``.

        * label = 1 if the profit target is hit before the stop loss within
          ``horizon`` bars, else 0 (stop hit first, or neither barrier hit).
        * If one bar touches both barriers, the stop loss wins (label 0),
          because the low is tested before the high.
        * Rows whose ATR is NaN (warm-up) or 0 keep label 0 and weight 1.0;
          they are not masked.
        * The last ``horizon`` rows are masked as -1 because their forward
          window is incomplete. With ``horizon == 0`` nothing is masked.

    Raises
    ------
    ValueError
        If ``horizon`` is negative.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be non-negative, got {horizon}")

    close = df["Close"].values
    high = df["High"].values
    low = df["Low"].values
    n = len(close)

    # True range needs the previous close, so it is defined from bar 1 onward;
    # a leading NaN keeps the ATR array aligned with the price arrays.
    true_range = np.maximum(
        high[1:] - low[1:],
        np.maximum(np.abs(high[1:] - close[:-1]), np.abs(low[1:] - close[:-1])),
    )
    atr = (
        pd.Series(np.concatenate([[np.nan], true_range]))
        .rolling(atr_period)
        .mean()
        .values
    )

    labels = np.zeros(n, dtype=int)
    time_weights = np.ones(n, dtype=float)

    # Only bars with a complete forward window are evaluated, so i + j < n
    # always holds inside the inner loop.
    for i in range(n - horizon):
        bar_atr = atr[i]
        if np.isnan(bar_atr) or bar_atr == 0:
            continue
        entry_price = close[i]
        upper_barrier = entry_price + pt_multiplier * bar_atr
        lower_barrier = entry_price - sl_multiplier * bar_atr

        for j in range(1, horizon + 1):
            # Stop is checked first: a bar touching both barriers is a loss.
            if low[i + j] <= lower_barrier:
                break
            if high[i + j] >= upper_barrier:
                labels[i] = 1
                if use_time_weight:
                    # Faster wins weigh more: 1.0 on the first bar, decaying after.
                    time_weights[i] = time_weight_decay ** (j - 1)
                break

    # labels[-0:] would select the whole array, so the zero-horizon case
    # must skip the mask entirely.
    if horizon > 0:
        labels[-horizon:] = -1
    return pd.Series(labels, index=df.index), pd.Series(time_weights, index=df.index)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ---------------------------------------------------------------------------
|
| 249 |
+
# ATR helper (for live stop/target calculation in the backtester)
|
| 250 |
+
# ---------------------------------------------------------------------------
|
| 251 |
+
|
| 252 |
+
def compute_atr(df: pd.DataFrame, period: int = 14) -> pd.Series:
    """
    Return the Average True Range as a simple rolling mean.

    True range per bar is the largest of: High - Low, |High - previous Close|
    and |Low - previous Close|. On the first bar there is no previous close,
    so the row-wise max skips the NaN candidates and falls back to High - Low.

    Parameters
    ----------
    df
        Frame with ``Close``, ``High`` and ``Low`` columns.
    period
        Rolling window length; the first ``period - 1`` rows of the result
        are NaN.
    """
    high, low = df["High"], df["Low"]
    prev_close = df["Close"].shift(1)
    candidates = pd.concat(
        [
            high - low,
            (high - prev_close).abs(),
            (low - prev_close).abs(),
        ],
        axis=1,
    )
    true_range = candidates.max(axis=1)
    return true_range.rolling(period).mean()
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# ---------------------------------------------------------------------------
|
| 263 |
+
# Confluence scoring (bonus filter, same as trainer)
|
| 264 |
+
# ---------------------------------------------------------------------------
|
| 265 |
+
|
| 266 |
+
def compute_confluence(X: pd.DataFrame) -> pd.Series:
    """
    Count how many of ten confluence conditions hold on each row.

    Each rule compares one feature column against a fixed threshold. When a
    column is absent from ``X``, a constant neutral fallback is used instead,
    chosen so that the rule never fires. NaN feature values compare as False
    and therefore add nothing.

    Returns a float Series indexed like ``X`` with values from 0.0 to 10.0.
    """
    # (column, fallback if the column is missing, Series comparison, threshold)
    rules = (
        ("rsi_14", 50, "lt", 35),            # RSI oversold
        ("stoch_k", 50, "lt", 25),           # Stoch oversold
        ("mfi_14", 50, "lt", 30),            # MFI oversold
        ("bb_pctb", 0.5, "lt", 0.1),         # Below BB lower
        ("dist_sma_20", 0, "lt", -0.03),     # Near SMA support
        ("vol_ratio_20d", 1, "gt", 1.5),     # Volume spike
        ("vix_pctile", 0.5, "gt", 0.7),      # VIX elevated
        ("consec_down_days", 0, "ge", 3),    # Consec down
        ("drawdown_5d", 0, "lt", -0.05),     # Recent drawdown
        ("sma50_slope", 0, "gt", 0),         # Trend intact
    )

    total = pd.Series(np.zeros(len(X)), index=X.index)
    for column, fallback, comparison, threshold in rules:
        if column in X.columns:
            values = X[column]
        else:
            values = pd.Series(fallback, index=X.index)
        fired = getattr(values, comparison)(threshold)
        total = total + fired.astype(float).fillna(0)
    return total
|