Riy777 committed on
Commit
bcb4fc4
·
verified ·
1 Parent(s): 1245491

Update backtest_engine.py

Browse files
Files changed (1) hide show
  1. backtest_engine.py +77 -43
backtest_engine.py CHANGED
@@ -1,5 +1,5 @@
1
  # ============================================================
2
- # 🧪 backtest_engine.py (V135.1 - GEM-Architect: EMA20 Fix)
3
  # ============================================================
4
 
5
  import asyncio
@@ -17,7 +17,7 @@ import traceback
17
  from numpy.lib.stride_tricks import sliding_window_view
18
  from datetime import datetime, timezone
19
  from typing import Dict, Any, List
20
- from scipy.special import expit # Sigmoid
21
 
22
  try:
23
  from ml_engine.processor import MLProcessor, SystemLimits
@@ -38,9 +38,11 @@ CACHE_DIR = "backtest_real_scores"
38
  # ============================================================
39
  def sanitize_features(df):
40
  if df is None or df.empty: return df
41
- return df.replace([np.inf, -np.inf], np.nan).fillna(0.0)
 
42
 
43
  def _z_roll(x, w=500):
 
44
  r = x.rolling(w).mean()
45
  s = x.rolling(w).std().replace(0, np.nan)
46
  return ((x - r) / s).fillna(0)
@@ -49,12 +51,27 @@ def _revive_score_distribution(scores):
49
  scores = np.array(scores, dtype=np.float32)
50
  if len(scores) < 10: return scores
51
  std = np.std(scores)
 
52
  if std < 0.05:
53
  mean = np.mean(scores)
54
  z = (scores - mean) / (std + 1e-9)
55
  return expit(z)
56
  return scores
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # ============================================================
59
  # 🧪 THE BACKTESTER CLASS
60
  # ============================================================
@@ -83,7 +100,7 @@ class HeavyDutyBacktester:
83
  self.force_end_date = None
84
 
85
  if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR)
86
- print(f"🧪 [Backtest V135.1] Feature Parity + Full Diagnostics + Speed.")
87
 
88
  def set_date_range(self, start_str, end_str):
89
  self.force_start_date = start_str
@@ -129,58 +146,67 @@ class HeavyDutyBacktester:
129
  return df.values.tolist()
130
 
131
  # ==============================================================
132
- # 🏎️ VECTORIZED INDICATORS (EXACT MATCH TO LIVE SYSTEM)
133
  # ==============================================================
134
  def _calculate_indicators_vectorized(self, df, timeframe='1m'):
135
  # 1. Clean Types
136
  cols = ['close', 'high', 'low', 'volume', 'open']
137
  for c in cols: df[c] = df[c].astype(np.float64)
138
 
 
 
 
 
139
  # ---------------------------------------------------------
140
  # 🧠 PART 1: TITAN FEATURES
141
  # ---------------------------------------------------------
142
- df['RSI'] = ta.rsi(df['close'], length=14).fillna(50)
143
 
 
144
  macd = ta.macd(df['close'])
145
  if macd is not None:
146
- df['MACD'] = macd.iloc[:, 0].fillna(0)
147
- df['MACD_h'] = macd.iloc[:, 1].fillna(0)
148
  else:
149
  df['MACD'] = 0.0; df['MACD_h'] = 0.0
150
 
151
- df['CCI'] = ta.cci(df['high'], df['low'], df['close'], length=20).fillna(0)
152
 
153
  adx = ta.adx(df['high'], df['low'], df['close'], length=14)
154
- if adx is not None: df['ADX'] = adx.iloc[:, 0].fillna(0)
155
  else: df['ADX'] = 0.0
156
 
157
- # Titan uses 9, 21, 50, 200
158
  for p in [9, 21, 50, 200]:
159
- ema = ta.ema(df['close'], length=p)
160
- df[f'EMA_{p}_dist'] = ((df['close'] / ema) - 1).fillna(0)
 
161
  df[f'ema{p}'] = ema
162
 
163
- # [GEM-FIX] Explicitly calculate EMA20 for Legacy models
164
- df['ema20'] = ta.ema(df['close'], length=20).fillna(df['close'])
165
 
166
  bb = ta.bbands(df['close'], length=20, std=2.0)
167
  if bb is not None:
168
- df['BB_w'] = ((bb.iloc[:, 2] - bb.iloc[:, 0]) / bb.iloc[:, 1]).fillna(0)
169
- df['BB_p'] = ((df['close'] - bb.iloc[:, 0]) / (bb.iloc[:, 2] - bb.iloc[:, 0])).fillna(0)
170
- df['bb_width'] = df['BB_w']
 
 
 
 
 
171
 
172
- df['MFI'] = ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=14).fillna(50)
173
 
174
  vwap = ta.vwap(df['high'], df['low'], df['close'], df['volume'])
175
  if vwap is not None:
176
- df['VWAP_dist'] = ((df['close'] / vwap) - 1).fillna(0)
177
  df['vwap'] = vwap
178
  else:
179
  df['VWAP_dist'] = 0.0
180
  df['vwap'] = df['close']
181
 
182
- df['atr'] = ta.atr(df['high'], df['low'], df['close'], length=14).fillna(0)
183
- df['atr_pct'] = df['atr'] / df['close']
184
 
185
  # ---------------------------------------------------------
186
  # 🎯 PART 2: SNIPER FEATURES (1m Only)
@@ -192,10 +218,12 @@ class HeavyDutyBacktester:
192
  df['return_15m'] = df['close'].pct_change(15).fillna(0)
193
 
194
  df['rsi_14'] = df['RSI']
195
- df['ema_9_slope'] = ((df['ema9'] - df['ema9'].shift(1)) / df['ema9'].shift(1)).fillna(0)
 
 
196
  df['ema_21_dist'] = df['EMA_21_dist']
197
 
198
- atr_100 = ta.atr(df['high'], df['low'], df['close'], length=100).fillna(0)
199
  df['atr_z'] = _z_roll(atr_100)
200
 
201
  df['vol_zscore_50'] = _z_roll(df['volume'], 50)
@@ -210,19 +238,19 @@ class HeavyDutyBacktester:
210
 
211
  dp = df['close'].diff()
212
  roll_cov = dp.rolling(64).cov(dp.shift(1))
213
- roll_spread_raw = (2 * np.sqrt(np.maximum(0, -roll_cov)))
214
  df['roll_spread'] = _z_roll(roll_spread_raw)
215
 
216
  sign = np.sign(df['close'].diff()).fillna(0)
217
  signed_vol = sign * df['volume']
218
- ofi_raw = signed_vol.rolling(30).sum()
219
  df['ofi'] = _z_roll(ofi_raw)
220
 
221
  buy_vol = (sign > 0) * df['volume']
222
  sell_vol = (sign < 0) * df['volume']
223
  imb = (buy_vol.rolling(60).sum() - sell_vol.rolling(60).sum()).abs()
224
- tot = df['volume'].rolling(60).sum()
225
- df['vpin'] = (imb / tot.replace(0, np.nan)).fillna(0)
226
 
227
  vwap_win = 20
228
  v_short = (df['dollar_vol'].rolling(vwap_win).sum() / df['volume'].rolling(vwap_win).sum().replace(0, np.nan)).fillna(df['close'])
@@ -236,29 +264,32 @@ class HeavyDutyBacktester:
236
  # ---------------------------------------------------------
237
  # 🧠 PART 3: ORACLE / HYDRA / LEGACY EXTRAS
238
  # ---------------------------------------------------------
239
- df['slope'] = ta.slope(df['close'], length=7).fillna(0)
240
  vol_mean = df['volume'].rolling(20).mean()
241
- vol_std = df['volume'].rolling(20).std()
242
- df['vol_z'] = ((df['volume'] - vol_mean) / (vol_std + 1e-9)).fillna(0)
243
 
244
  df['rel_vol'] = df['volume'] / (df['volume'].rolling(50).mean() + 1e-9)
245
 
246
- df['log_ret'] = np.log(df['close'] / df['close'].shift(1)).fillna(0)
247
  roll_max = df['high'].rolling(50).max()
248
  roll_min = df['low'].rolling(50).min()
249
  diff = (roll_max - roll_min).replace(0, 1e-9)
250
  df['fib_pos'] = ((df['close'] - roll_min) / diff).fillna(0.5)
251
 
252
- # Now 'ema20' exists!
253
- df['trend_slope'] = ((df['ema20'] - df['ema20'].shift(5)) / df['ema20'].shift(5)).fillna(0)
254
- df['volatility'] = (df['atr'] / df['close']).fillna(0)
255
 
256
  fib618 = roll_max - (diff * 0.382)
257
- df['dist_fib618'] = ((df['close'] - fib618) / df['close']).fillna(0)
 
 
 
258
 
259
- df['dist_ema50'] = (df['close'] - df['ema50']) / df['close']
260
- df['ema200'] = ta.ema(df['close'], length=200)
261
- df['dist_ema200'] = (df['close'] - df['ema200']) / df['close']
262
 
263
  if timeframe == '1m':
264
  for lag in [1, 2, 3, 5, 10, 20]:
@@ -267,11 +298,12 @@ class HeavyDutyBacktester:
267
  df[f'fib_pos_lag_{lag}'] = df['fib_pos'].shift(lag).fillna(0.5)
268
  df[f'volatility_lag_{lag}'] = df['volatility'].shift(lag).fillna(0)
269
 
 
270
  df.fillna(0, inplace=True)
271
  return df
272
 
273
  # ==============================================================
274
- # 🧠 CPU PROCESSING (GLOBAL INFERENCE + FULL FEATURE PARITY)
275
  # ==============================================================
276
  async def _process_data_in_memory(self, sym, candles, start_ms, end_ms):
277
  safe_sym = sym.replace('/', '_')
@@ -282,7 +314,7 @@ class HeavyDutyBacktester:
282
  print(f" 📂 [{sym}] Data Exists -> Skipping.")
283
  return
284
 
285
- print(f" ⚙️ [CPU] Analyzing {sym} (Full Stack / High Fidelity)...", flush=True)
286
  t0 = time.time()
287
 
288
  df_1m = pd.DataFrame(candles, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
@@ -307,7 +339,7 @@ class HeavyDutyBacktester:
307
  frames[tf_str] = resampled
308
  numpy_htf[tf_str] = {col: resampled[col].values for col in resampled.columns}
309
 
310
- # 3. Global Index Maps
311
  arr_ts_1m = fast_1m['timestamp']
312
  map_5m = np.clip(np.searchsorted(numpy_htf['5m']['timestamp'], arr_ts_1m), 0, len(numpy_htf['5m']['timestamp']) - 1)
313
  map_15m = np.clip(np.searchsorted(numpy_htf['15m']['timestamp'], arr_ts_1m), 0, len(numpy_htf['15m']['timestamp']) - 1)
@@ -344,7 +376,10 @@ class HeavyDutyBacktester:
344
  else:
345
  t_vecs.append(np.zeros(len(arr_ts_1m)))
346
 
 
347
  X_TITAN = np.column_stack(t_vecs)
 
 
348
  preds_t = titan_model.predict(xgb.DMatrix(X_TITAN))
349
  global_titan_scores = _revive_score_distribution(preds_t)
350
  except Exception as e: print(f"Titan Error: {e}")
@@ -485,7 +520,6 @@ class HeavyDutyBacktester:
485
  l2_arr = np.full(240, 0.7)
486
  tgt_arr = np.full(240, 3.0)
487
 
488
- # [rsi1, rsi5, rsi15, bb, vol, dist_ema, atr_p, norm, max, dists, time, entry, oracle, l2, target]
489
  X_H = np.column_stack([
490
  sl_st[:,0], sl_st[:,1], sl_st[:,2], sl_st[:,3], sl_st[:,4],
491
  zeros, atr_pct, norm_pnl, max_pnl_r,
 
1
  # ============================================================
2
+ # 🧪 backtest_engine.py (V136.0 - GEM-Architect: Data Integrity Fixed)
3
  # ============================================================
4
 
5
  import asyncio
 
17
  from numpy.lib.stride_tricks import sliding_window_view
18
  from datetime import datetime, timezone
19
  from typing import Dict, Any, List
20
+ from scipy.special import expit
21
 
22
  try:
23
  from ml_engine.processor import MLProcessor, SystemLimits
 
38
  # ============================================================
39
def sanitize_features(df):
    """Return *df* with infinities and NaNs replaced by finite values.

    Infinities become NaN, gaps are forward/backward filled so indicator
    warmup rows inherit the nearest real value instead of a flat zero,
    and any still-empty cells (all-NaN columns) become 0.0.
    None and empty frames are returned unchanged.
    """
    if df is None or df.empty:
        return df
    # NOTE(review): bfill pulls later values backwards in time. That is fine
    # for the leading warmup rows, but confirm it cannot leak future data
    # into mid-series gaps of a backtest feature frame.
    return df.replace([np.inf, -np.inf], np.nan).ffill().bfill().fillna(0.0)
43
 
44
  def _z_roll(x, w=500):
45
+ if not isinstance(x, pd.Series): x = pd.Series(x)
46
  r = x.rolling(w).mean()
47
  s = x.rolling(w).std().replace(0, np.nan)
48
  return ((x - r) / s).fillna(0)
 
51
  scores = np.array(scores, dtype=np.float32)
52
  if len(scores) < 10: return scores
53
  std = np.std(scores)
54
+ # If standard deviation is extremely low, it means model is outputting constant 'dead' values
55
  if std < 0.05:
56
  mean = np.mean(scores)
57
  z = (scores - mean) / (std + 1e-9)
58
  return expit(z)
59
  return scores
60
 
61
+ # ✅ [GEM-FIX] Smart Indicator Wrapper (No more Zeros)
62
# ✅ [GEM-FIX] Smart Indicator Wrapper (No more Zeros)
def safe_ta(ind_output, index, fill_method='smart'):
    """Coerce a pandas-ta indicator result into a gap-free float64 Series.

    Parameters
    ----------
    ind_output : pd.Series | array-like | None
        Raw indicator output; ``None`` (indicator failed) yields all zeros.
    index : pd.Index
        Index used to align array-like output (and the all-zero fallback).
    fill_method : str, optional
        ``'smart'`` (default): backfill the warmup gap, then forward-fill
        later gaps. Any other value skips the directional fills and relies
        on the final zero-fill only. (Previously this parameter was
        accepted but ignored; the default keeps the old behavior.)
    """
    if ind_output is None:
        return pd.Series(0.0, index=index, dtype='float64')

    if isinstance(ind_output, pd.Series):
        s = ind_output
    else:
        s = pd.Series(ind_output, index=index)

    if fill_method == 'smart':
        # Backfill first so the indicator warmup period inherits the first
        # real value instead of a flat 0, then forward-fill trailing gaps.
        s = s.bfill().ffill()
    return s.fillna(0.0).astype('float64')
74
+
75
  # ============================================================
76
  # 🧪 THE BACKTESTER CLASS
77
  # ============================================================
 
100
  self.force_end_date = None
101
 
102
  if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR)
103
+ print(f"🧪 [Backtest V136.0] Data Integrity Edition (Smart-Fill Active).")
104
 
105
  def set_date_range(self, start_str, end_str):
106
  self.force_start_date = start_str
 
146
  return df.values.tolist()
147
 
148
  # ==============================================================
149
+ # 🏎️ VECTORIZED INDICATORS (SMART FILL)
150
  # ==============================================================
151
  def _calculate_indicators_vectorized(self, df, timeframe='1m'):
152
  # 1. Clean Types
153
  cols = ['close', 'high', 'low', 'volume', 'open']
154
  for c in cols: df[c] = df[c].astype(np.float64)
155
 
156
+ # Ensure no gaps in price before calc
157
+ df[cols] = df[cols].ffill().bfill()
158
+ idx = df.index
159
+
160
  # ---------------------------------------------------------
161
  # 🧠 PART 1: TITAN FEATURES
162
  # ---------------------------------------------------------
163
+ df['RSI'] = safe_ta(ta.rsi(df['close'], length=14), idx)
164
 
165
+ # MACD
166
  macd = ta.macd(df['close'])
167
  if macd is not None:
168
+ df['MACD'] = safe_ta(macd.iloc[:, 0], idx)
169
+ df['MACD_h'] = safe_ta(macd.iloc[:, 1], idx)
170
  else:
171
  df['MACD'] = 0.0; df['MACD_h'] = 0.0
172
 
173
+ df['CCI'] = safe_ta(ta.cci(df['high'], df['low'], df['close'], length=20), idx)
174
 
175
  adx = ta.adx(df['high'], df['low'], df['close'], length=14)
176
+ if adx is not None: df['ADX'] = safe_ta(adx.iloc[:, 0], idx)
177
  else: df['ADX'] = 0.0
178
 
 
179
  for p in [9, 21, 50, 200]:
180
+ ema = safe_ta(ta.ema(df['close'], length=p), idx)
181
+ # Use replace(0, np.nan) to avoid Infinity
182
+ df[f'EMA_{p}_dist'] = ((df['close'] / ema.replace(0, np.nan)) - 1).fillna(0)
183
  df[f'ema{p}'] = ema
184
 
185
+ df['ema20'] = safe_ta(ta.ema(df['close'], length=20), idx)
 
186
 
187
  bb = ta.bbands(df['close'], length=20, std=2.0)
188
  if bb is not None:
189
+ # Width
190
+ w = ((bb.iloc[:, 2] - bb.iloc[:, 0]) / bb.iloc[:, 1].replace(0, np.nan)).fillna(0)
191
+ # %B
192
+ p = ((df['close'] - bb.iloc[:, 0]) / (bb.iloc[:, 2] - bb.iloc[:, 0]).replace(0, np.nan)).fillna(0)
193
+ df['BB_w'] = w; df['BB_p'] = p
194
+ df['bb_width'] = w
195
+ else:
196
+ df['BB_w'] = 0; df['BB_p'] = 0; df['bb_width'] = 0
197
 
198
+ df['MFI'] = safe_ta(ta.mfi(df['high'], df['low'], df['close'], df['volume'], length=14), idx)
199
 
200
  vwap = ta.vwap(df['high'], df['low'], df['close'], df['volume'])
201
  if vwap is not None:
202
+ df['VWAP_dist'] = ((df['close'] / vwap.replace(0, np.nan)) - 1).fillna(0)
203
  df['vwap'] = vwap
204
  else:
205
  df['VWAP_dist'] = 0.0
206
  df['vwap'] = df['close']
207
 
208
+ df['atr'] = safe_ta(ta.atr(df['high'], df['low'], df['close'], length=14), idx)
209
+ df['atr_pct'] = (df['atr'] / df['close'].replace(0, np.nan)).fillna(0)
210
 
211
  # ---------------------------------------------------------
212
  # 🎯 PART 2: SNIPER FEATURES (1m Only)
 
218
  df['return_15m'] = df['close'].pct_change(15).fillna(0)
219
 
220
  df['rsi_14'] = df['RSI']
221
+
222
+ e9 = df['ema9'].replace(0, np.nan)
223
+ df['ema_9_slope'] = ((df['ema9'] - df['ema9'].shift(1)) / e9.shift(1)).fillna(0)
224
  df['ema_21_dist'] = df['EMA_21_dist']
225
 
226
+ atr_100 = safe_ta(ta.atr(df['high'], df['low'], df['close'], length=100), idx)
227
  df['atr_z'] = _z_roll(atr_100)
228
 
229
  df['vol_zscore_50'] = _z_roll(df['volume'], 50)
 
238
 
239
  dp = df['close'].diff()
240
  roll_cov = dp.rolling(64).cov(dp.shift(1))
241
+ roll_spread_raw = (2 * np.sqrt(np.maximum(0, -roll_cov))).fillna(0)
242
  df['roll_spread'] = _z_roll(roll_spread_raw)
243
 
244
  sign = np.sign(df['close'].diff()).fillna(0)
245
  signed_vol = sign * df['volume']
246
+ ofi_raw = signed_vol.rolling(30).sum().fillna(0)
247
  df['ofi'] = _z_roll(ofi_raw)
248
 
249
  buy_vol = (sign > 0) * df['volume']
250
  sell_vol = (sign < 0) * df['volume']
251
  imb = (buy_vol.rolling(60).sum() - sell_vol.rolling(60).sum()).abs()
252
+ tot = df['volume'].rolling(60).sum().replace(0, np.nan)
253
+ df['vpin'] = (imb / tot).fillna(0)
254
 
255
  vwap_win = 20
256
  v_short = (df['dollar_vol'].rolling(vwap_win).sum() / df['volume'].rolling(vwap_win).sum().replace(0, np.nan)).fillna(df['close'])
 
264
  # ---------------------------------------------------------
265
  # 🧠 PART 3: ORACLE / HYDRA / LEGACY EXTRAS
266
  # ---------------------------------------------------------
267
+ df['slope'] = safe_ta(ta.slope(df['close'], length=7), idx)
268
  vol_mean = df['volume'].rolling(20).mean()
269
+ vol_std = df['volume'].rolling(20).std().replace(0, np.nan)
270
+ df['vol_z'] = ((df['volume'] - vol_mean) / vol_std).fillna(0)
271
 
272
  df['rel_vol'] = df['volume'] / (df['volume'].rolling(50).mean() + 1e-9)
273
 
274
+ df['log_ret'] = np.log(df['close'] / df['close'].shift(1).replace(0, np.nan)).fillna(0)
275
  roll_max = df['high'].rolling(50).max()
276
  roll_min = df['low'].rolling(50).min()
277
  diff = (roll_max - roll_min).replace(0, 1e-9)
278
  df['fib_pos'] = ((df['close'] - roll_min) / diff).fillna(0.5)
279
 
280
+ e20_s = df['ema20'].shift(5).replace(0, np.nan)
281
+ df['trend_slope'] = ((df['ema20'] - df['ema20'].shift(5)) / e20_s).fillna(0)
282
+ df['volatility'] = (df['atr'] / df['close'].replace(0, np.nan)).fillna(0)
283
 
284
  fib618 = roll_max - (diff * 0.382)
285
+ df['dist_fib618'] = ((df['close'] - fib618) / df['close'].replace(0, np.nan)).fillna(0)
286
+
287
+ e50 = df['ema50'].replace(0, np.nan)
288
+ df['dist_ema50'] = ((df['close'] - df['ema50']) / e50).fillna(0)
289
 
290
+ e200 = safe_ta(ta.ema(df['close'], length=200), idx) # Safe Fill
291
+ df['ema200'] = e200
292
+ df['dist_ema200'] = ((df['close'] - e200) / e200.replace(0, np.nan)).fillna(0)
293
 
294
  if timeframe == '1m':
295
  for lag in [1, 2, 3, 5, 10, 20]:
 
298
  df[f'fib_pos_lag_{lag}'] = df['fib_pos'].shift(lag).fillna(0.5)
299
  df[f'volatility_lag_{lag}'] = df['volatility'].shift(lag).fillna(0)
300
 
301
+ # FINAL SANITIZATION
302
  df.fillna(0, inplace=True)
303
  return df
304
 
305
  # ==============================================================
306
+ # 🧠 CPU PROCESSING
307
  # ==============================================================
308
  async def _process_data_in_memory(self, sym, candles, start_ms, end_ms):
309
  safe_sym = sym.replace('/', '_')
 
314
  print(f" 📂 [{sym}] Data Exists -> Skipping.")
315
  return
316
 
317
+ print(f" ⚙️ [CPU] Analyzing {sym}...", flush=True)
318
  t0 = time.time()
319
 
320
  df_1m = pd.DataFrame(candles, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
 
339
  frames[tf_str] = resampled
340
  numpy_htf[tf_str] = {col: resampled[col].values for col in resampled.columns}
341
 
342
+ # 3. Global Maps
343
  arr_ts_1m = fast_1m['timestamp']
344
  map_5m = np.clip(np.searchsorted(numpy_htf['5m']['timestamp'], arr_ts_1m), 0, len(numpy_htf['5m']['timestamp']) - 1)
345
  map_15m = np.clip(np.searchsorted(numpy_htf['15m']['timestamp'], arr_ts_1m), 0, len(numpy_htf['15m']['timestamp']) - 1)
 
376
  else:
377
  t_vecs.append(np.zeros(len(arr_ts_1m)))
378
 
379
+ # Check mean to ensure data isn't all zeros
380
  X_TITAN = np.column_stack(t_vecs)
381
+ # print(f" [DEBUG] Titan Input Mean: {np.mean(X_TITAN):.4f}")
382
+
383
  preds_t = titan_model.predict(xgb.DMatrix(X_TITAN))
384
  global_titan_scores = _revive_score_distribution(preds_t)
385
  except Exception as e: print(f"Titan Error: {e}")
 
520
  l2_arr = np.full(240, 0.7)
521
  tgt_arr = np.full(240, 3.0)
522
 
 
523
  X_H = np.column_stack([
524
  sl_st[:,0], sl_st[:,1], sl_st[:,2], sl_st[:,3], sl_st[:,4],
525
  zeros, atr_pct, norm_pnl, max_pnl_r,