P2SAMAPA committed on
Commit
5515a38
·
verified Β·
1 Parent(s): bb5e2b5

[auto] Sync code from GitHub

Browse files
Files changed (1) hide show
  1. evaluate.py +157 -174
evaluate.py CHANGED
@@ -1,14 +1,8 @@
1
  # evaluate.py
2
- # Backtests all 3 models on the test set.
3
- # Applies trailing stop loss (TSL) + Z-score re-entry logic.
4
- # Compares vs SPY and AGG buy-and-hold and AR(1) baseline.
5
- # Outputs evaluation_results.json
6
-
7
  import json
8
  import os
9
  import numpy as np
10
  import pandas as pd
11
- from scipy import stats
12
  from datetime import datetime
13
 
14
  import config
@@ -17,101 +11,84 @@ from preprocess import run_preprocessing
17
  import model_a, model_b, model_c
18
 
19
 
20
- # ─── Signal generator ─────────────────────────────────────────────────────────
21
 
22
- def raw_signals(model, prep: dict, is_dual: bool = False) -> np.ndarray:
23
- """
24
- Run model on test set.
25
- Returns predicted log-return matrix (N_test, 5).
26
- """
27
  X_te = prep["X_te"]
28
  if is_dual:
29
- n_etf = prep["n_etf_features"]
30
- inputs = [X_te[:, :, :n_etf], X_te[:, :, n_etf:]]
31
  else:
32
  inputs = X_te
33
  return model.predict(inputs, verbose=0) # (N, 5)
34
 
35
 
36
- def softmax_probs(preds: np.ndarray) -> np.ndarray:
37
- """Convert raw predicted returns to softmax probabilities."""
 
 
38
  e = np.exp(preds - preds.max(axis=1, keepdims=True))
39
- return e / e.sum(axis=1, keepdims=True)
40
 
41
 
42
- def z_score(probs: np.ndarray) -> np.ndarray:
43
  """
44
- Z-score: how many std devs is the top ETF prob above the mean of all ETF probs?
45
- Shape: (N,)
46
  """
47
- top = probs.max(axis=1)
48
- mu = probs.mean(axis=1)
49
- sigma = probs.std(axis=1) + 1e-8
50
- return (top - mu) / sigma
51
 
52
 
53
- # ─── TSL + Z-score backtest ───────────────────────────────────────────────────
54
 
55
- def backtest(probs: np.ndarray,
56
- dates: np.ndarray,
57
- etf_returns: pd.DataFrame,
58
- tbill_series: pd.Series,
59
- fee_bps: float = 10,
60
- tsl_pct: float = config.DEFAULT_TSL_PCT,
61
- z_reentry: float = config.DEFAULT_Z_REENTRY) -> pd.DataFrame:
62
  """
63
- Day-by-day backtest with:
64
- - Top ETF selected by argmax(probs)
65
- - Trailing stop loss: if 2-day cumulative return <= -tsl_pct β†’ CASH
66
- - CASH earns daily 3m T-bill rate
67
- - Re-enter ETF when Z >= z_reentry
68
-
69
- Returns DataFrame with columns:
70
- Date, Signal, Confidence, Z_Score, Mode,
71
- Gross_Return, Fee, Net_Return, Cumulative
72
  """
73
- n = len(probs)
74
- etf_idx = {e: i for i, e in enumerate(config.ETFS)}
75
-
76
  records = []
77
  in_cash = False
78
- prev_ret = 0.0 # previous day net return for 2-day rolling
79
- prev2_ret = 0.0 # two days ago
80
  last_signal= None
81
 
82
- for i in range(n):
83
- date = pd.Timestamp(dates[i])
84
- prob = probs[i]
85
- z = float(z_score(probs[i:i+1])[0])
86
- top_i = int(np.argmax(prob))
87
- etf = config.ETFS[top_i]
88
- conf = float(prob[top_i])
89
 
90
- # 2-day cumulative return check (prev_ret + prev2_ret)
91
- two_day_cumul = prev_ret + prev2_ret
92
 
93
- # Check TSL trigger
94
- if not in_cash and two_day_cumul <= -(tsl_pct / 100):
95
  in_cash = True
96
 
97
- # Check Z-score re-entry
98
  if in_cash and z >= z_reentry:
99
  in_cash = False
100
 
101
- # Get actual return for this date
102
  if date in etf_returns.index:
103
  if in_cash:
104
- # Earn daily T-bill
105
- tbill_rate = tbill_series.get(date, 3.6) / 100
106
- gross_ret = tbill_rate / 252
107
  fee = 0.0
108
  mode = "CASH"
109
  signal = "CASH"
110
  else:
111
  gross_ret = float(etf_returns.loc[date, etf]) \
112
  if etf in etf_returns.columns else 0.0
113
- # Transaction fee on signal change
114
- fee = (fee_bps / 10000) if etf != last_signal else 0.0
115
  gross_ret -= fee
116
  mode = "ETF"
117
  signal = etf
@@ -123,14 +100,14 @@ def backtest(probs: np.ndarray,
123
  signal = "CASH" if in_cash else etf
124
 
125
  records.append(dict(
126
- Date = date,
127
- Signal = signal,
128
- Confidence = round(conf, 4),
129
- Z_Score = round(z, 3),
130
- Mode = mode,
131
- Gross_Return= round(gross_ret, 6),
132
- Fee = round(fee, 6),
133
- Net_Return = round(gross_ret, 6),
134
  ))
135
 
136
  prev2_ret = prev_ret
@@ -143,65 +120,60 @@ def backtest(probs: np.ndarray,
143
 
144
  # ─── Performance metrics ──────────────────────────────────────────────────────
145
 
146
- def compute_metrics(bt: pd.DataFrame,
147
- bench_ret: pd.DataFrame,
148
- tbill_series: pd.Series) -> dict:
149
- rets = bt["Net_Return"].values
150
- dates = bt["Date"]
151
-
152
- # Annualised return
153
  n_days = len(rets)
154
- total = (1 + pd.Series(rets)).prod()
155
- ann_ret= (total ** (252 / n_days) - 1) * 100
156
 
157
- # Sharpe (excess over T-bill)
158
- tbill_daily = tbill_series.reindex(dates).ffill().fillna(3.6) / 100 / 252
159
- excess = rets - tbill_daily.values
160
- sharpe = (excess.mean() / (excess.std() + 1e-8)) * np.sqrt(252)
161
 
162
- # Max drawdown
163
- cum = np.cumprod(1 + rets)
164
- peak = np.maximum.accumulate(cum)
165
- dd = (cum - peak) / peak
166
- max_dd= float(dd.min()) * 100
167
 
168
- # Max daily DD
 
 
 
169
  max_daily_dd = float(rets.min()) * 100
170
 
171
- # Hit ratio (15-day rolling direction)
172
- signs = np.sign(rets)
173
- hit_15 = pd.Series(signs).rolling(15).apply(
174
- lambda x: (x > 0).mean()).mean()
175
 
176
- # Benchmark SPY ann return (same test period)
177
- spy_dates = bench_ret.index.intersection(dates)
178
- spy_rets = bench_ret.loc[spy_dates, "SPY"].values if "SPY" in bench_ret.columns else np.zeros(1)
179
- spy_total = (1 + pd.Series(spy_rets)).prod()
180
- spy_ann = (spy_total ** (252 / max(len(spy_rets), 1)) - 1) * 100
 
 
 
 
181
 
182
  return dict(
183
- ann_return = round(float(ann_ret), 2),
184
- sharpe = round(float(sharpe), 3),
185
- hit_ratio_15d = round(float(hit_15), 3),
186
- max_drawdown = round(float(max_dd), 2),
187
- max_daily_dd = round(float(max_daily_dd), 2),
188
- vs_spy = round(float(ann_ret - spy_ann), 2),
 
189
  )
190
 
191
 
192
  # ─── AR(1) baseline ───────────────────────────────────────────────────────────
193
 
194
- def ar1_backtest(etf_returns: pd.DataFrame,
195
- test_dates: np.ndarray) -> pd.DataFrame:
196
- """Naive AR(1): predict next return = lag-1 return, pick best ETF."""
197
- records = []
198
- dates_set = set(pd.to_datetime(test_dates))
199
- df = etf_returns[etf_returns.index.isin(dates_set)].copy()
200
  prev = df.shift(1).fillna(0)
201
  for date, row in df.iterrows():
202
- best_etf = prev.loc[date].idxmax()
203
- ret = float(row[best_etf])
204
- records.append(dict(Date=date, Signal=best_etf, Net_Return=ret))
205
  out = pd.DataFrame(records)
206
  out["Cumulative"] = (1 + out["Net_Return"]).cumprod()
207
  return out
@@ -209,121 +181,132 @@ def ar1_backtest(etf_returns: pd.DataFrame,
209
 
210
  # ─── Full evaluation ──────────────────────────────────────────────────────────
211
 
212
- def run_evaluation(tsl_pct: float = config.DEFAULT_TSL_PCT,
213
- z_reentry: float = config.DEFAULT_Z_REENTRY,
214
- fee_bps: float = 10) -> dict:
215
 
216
  print(f"\n{'='*60}")
217
- print(f" Evaluation — TSL={tsl_pct}% Z-reentry={z_reentry}σ")
 
218
  print(f"{'='*60}")
219
 
220
  data = load_local()
221
  if not data:
222
  raise RuntimeError("No data. Run data_download.py first.")
223
 
224
- # Load training summary for best lookbacks
 
 
 
 
 
 
 
 
 
 
 
225
  summary_path = os.path.join(config.MODELS_DIR, "training_summary.json")
 
226
  if os.path.exists(summary_path):
227
  with open(summary_path) as f:
228
- summary = json.load(f)
229
- lb_a = summary.get("model_a", {}).get("best_lookback", 30)
230
- lb_b = summary.get("model_b", {}).get("best_lookback", 30)
231
- lb_c = summary.get("model_c", {}).get("best_lookback", 30)
232
- else:
233
- lb_a = lb_b = lb_c = config.DEFAULT_LOOKBACK
234
-
235
- tbill = data["macro"]["TBILL_3M"] if "TBILL_3M" in data["macro"].columns \
236
- else pd.Series(3.6, index=data["macro"].index)
237
 
238
  results = {}
239
 
240
- for tag, module, lb, is_dual in [
241
- ("model_a", model_a, lb_a, False),
242
- ("model_b", model_b, lb_b, False),
243
- ("model_c", model_c, lb_c, True),
244
  ]:
 
245
  print(f"\n Evaluating {tag.upper()} (lb={lb}d)...")
246
- prep = run_preprocessing(data, lb)
247
 
248
  try:
249
- m = module.load_model(lb)
250
  except Exception as e:
251
  print(f" Could not load {tag}: {e}")
252
  continue
253
 
254
- preds = raw_signals(m, prep, is_dual=is_dual)
255
- probs = softmax_probs(preds)
 
 
 
256
 
257
- bt = backtest(probs, prep["d_te"],
258
- data["etf_ret"][config.ETFS],
259
- tbill,
260
- fee_bps=fee_bps,
261
- tsl_pct=tsl_pct,
262
  z_reentry=z_reentry)
263
 
264
- metrics = compute_metrics(bt, data["bench_ret"], tbill)
 
 
 
 
265
  results[tag] = dict(
266
- metrics = metrics,
267
- lookback = lb,
268
- audit_tail = bt.tail(20).to_dict(orient="records"),
269
- all_signals= bt.to_dict(orient="records"),
270
  )
271
- print(f" Ann Return: {metrics['ann_return']}% "
272
- f"Sharpe: {metrics['sharpe']} "
273
- f"MaxDD: {metrics['max_drawdown']}%")
 
274
 
275
  # AR(1) baseline
276
- ar1_bt = ar1_backtest(data["etf_ret"][config.ETFS],
277
- run_preprocessing(data, 30)["d_te"])
278
  ar1_rets = ar1_bt["Net_Return"].values
279
- n_days = len(ar1_rets)
280
- ar1_ann = ((1 + pd.Series(ar1_rets)).prod() ** (252/n_days) - 1) * 100
281
  results["ar1_baseline"] = dict(ann_return=round(float(ar1_ann), 2))
282
 
283
- # Benchmark metrics (buy & hold, same test period)
284
  for bench in config.BENCHMARKS:
285
- test_dates = run_preprocessing(data, 30)["d_te"]
286
- b_dates = data["bench_ret"].index.intersection(pd.to_datetime(test_dates))
287
- b_rets = data["bench_ret"].loc[b_dates, bench].values \
288
- if bench in data["bench_ret"].columns else np.zeros(1)
289
  b_total = (1 + pd.Series(b_rets)).prod()
290
- b_ann = (b_total ** (252 / max(len(b_rets), 1)) - 1) * 100
291
- b_sharpe = (b_rets.mean() / (b_rets.std() + 1e-8)) * np.sqrt(252)
292
  b_cum = np.cumprod(1 + b_rets)
293
  b_peak = np.maximum.accumulate(b_cum)
294
- b_mdd = float(((b_cum - b_peak) / b_peak).min()) * 100
295
  results[bench] = dict(
296
  ann_return = round(float(b_ann), 2),
297
- sharpe = round(float(b_sharpe), 3),
298
  max_drawdown = round(float(b_mdd), 2),
299
  )
300
 
301
  # Winner
302
- model_keys = ["model_a", "model_b", "model_c"]
303
- valid_models= [k for k in model_keys if k in results]
304
- if valid_models:
305
- winner = max(valid_models,
306
  key=lambda k: results[k]["metrics"]["ann_return"])
307
  results["winner"] = winner
308
  print(f"\n ⭐ WINNER: {winner.upper()} "
309
- f"({results[winner]['metrics']['ann_return']}% ann. return)")
310
 
311
  results["evaluated_at"] = datetime.now().isoformat()
312
  results["tsl_pct"] = tsl_pct
313
  results["z_reentry"] = z_reentry
314
 
315
- out_path = "evaluation_results.json"
316
- with open(out_path, "w") as f:
317
  json.dump(results, f, indent=2, default=str)
318
- print(f"\n Results saved → {out_path}")
319
  return results
320
 
321
 
322
  if __name__ == "__main__":
323
  import argparse
324
  parser = argparse.ArgumentParser()
325
- parser.add_argument("--tsl", type=float, default=config.DEFAULT_TSL_PCT)
326
- parser.add_argument("--z", type=float, default=config.DEFAULT_Z_REENTRY)
327
- parser.add_argument("--fee", type=float, default=10)
328
  args = parser.parse_args()
329
  run_evaluation(tsl_pct=args.tsl, z_reentry=args.z, fee_bps=args.fee)
 
1
  # evaluate.py
 
 
 
 
 
2
  import json
3
  import os
4
  import numpy as np
5
  import pandas as pd
 
6
  from datetime import datetime
7
 
8
  import config
 
11
  import model_a, model_b, model_c
12
 
13
 
14
+ # ─── Signal generation ────────────────────────────────────────────────────────
15
 
16
+ def raw_signals(model, prep, is_dual=False):
 
 
 
 
17
  X_te = prep["X_te"]
18
  if is_dual:
19
+ n = prep["n_etf_features"]
20
+ inputs = [X_te[:, :, :n], X_te[:, :, n:]]
21
  else:
22
  inputs = X_te
23
  return model.predict(inputs, verbose=0) # (N, 5)
24
 
25
 
26
+ def softmax_probs(preds):
27
+ """Row-wise softmax β†’ probabilities sum to 1."""
28
+ preds = np.array(preds)
29
+ # Subtract row max for numerical stability
30
  e = np.exp(preds - preds.max(axis=1, keepdims=True))
31
+ return e / e.sum(axis=1, keepdims=True) # (N, 5)
32
 
33
 
34
+ def compute_z_scores(probs):
35
  """
36
+ Per-row Z-score: how many std devs is the top ETF above the row mean?
37
+ Returns array of shape (N,)
38
  """
39
+ top = probs.max(axis=1) # (N,)
40
+ mu = probs.mean(axis=1) # (N,)
41
+ sigma = probs.std(axis=1) + 1e-8 # (N,)
42
+ return (top - mu) / sigma # (N,)
43
 
44
 
45
+ # ─── TSL backtest ─────────────────────────────────────────────────────────────
46
 
47
+ def backtest(probs, dates, etf_returns, tbill_series,
48
+ fee_bps=10, tsl_pct=10.0, z_reentry=1.1):
 
 
 
 
 
49
  """
50
+ Day-by-day backtest with proper TSL + Z-score re-entry logic.
51
+ tsl_pct: float e.g. 10.0 means trigger at -10% 2-day cumul
52
+ z_reentry: float e.g. 1.1 sigma
 
 
 
 
 
 
53
  """
54
+ z_scores = compute_z_scores(probs) # (N,) β€” one per day
 
 
55
  records = []
56
  in_cash = False
57
+ prev_ret = 0.0
58
+ prev2_ret = 0.0
59
  last_signal= None
60
 
61
+ for i in range(len(probs)):
62
+ date = pd.Timestamp(dates[i])
63
+ prob = probs[i] # (5,)
64
+ z = float(z_scores[i]) # scalar for this day
65
+ top_i = int(np.argmax(prob))
66
+ etf = config.ETFS[top_i]
67
+ conf = float(prob[top_i]) # already a probability 0-1
68
 
69
+ # 2-day cumulative return (previous 2 days)
70
+ two_day_cumul_pct = (prev_ret + prev2_ret) * 100
71
 
72
+ # ── TSL trigger ───────────────────────────────────────────────────────
73
+ if not in_cash and two_day_cumul_pct <= -tsl_pct:
74
  in_cash = True
75
 
76
+ # ── Z-score re-entry ──────────────────────────────────────────────────
77
  if in_cash and z >= z_reentry:
78
  in_cash = False
79
 
80
+ # ── Get actual return ─────────────────────────────────────────────────
81
  if date in etf_returns.index:
82
  if in_cash:
83
+ tbill_rate = float(tbill_series.get(date, 3.6))
84
+ gross_ret = (tbill_rate / 100) / 252
 
85
  fee = 0.0
86
  mode = "CASH"
87
  signal = "CASH"
88
  else:
89
  gross_ret = float(etf_returns.loc[date, etf]) \
90
  if etf in etf_returns.columns else 0.0
91
+ fee = (fee_bps / 10000) if etf != last_signal else 0.0
 
92
  gross_ret -= fee
93
  mode = "ETF"
94
  signal = etf
 
100
  signal = "CASH" if in_cash else etf
101
 
102
  records.append(dict(
103
+ Date = str(date.date()),
104
+ Signal = signal,
105
+ Confidence = round(conf, 4), # 0-1 float
106
+ Z_Score = round(z, 4), # per-day z
107
+ Two_Day_Cumul_Pct = round(two_day_cumul_pct, 2),
108
+ Mode = mode,
109
+ Net_Return = round(gross_ret, 6),
110
+ TSL_Triggered = in_cash,
111
  ))
112
 
113
  prev2_ret = prev_ret
 
120
 
121
  # ─── Performance metrics ──────────────────────────────────────────────────────
122
 
123
+ def compute_metrics(bt, bench_ret, tbill_series):
124
+ rets = bt["Net_Return"].values
125
+ dates = pd.to_datetime(bt["Date"])
 
 
 
 
126
  n_days = len(rets)
 
 
127
 
128
+ total = float((1 + pd.Series(rets)).prod())
129
+ ann_ret = (total ** (252 / n_days) - 1) * 100
 
 
130
 
131
+ tbill_daily = tbill_series.reindex(dates).ffill().fillna(3.6) / 100 / 252
132
+ excess = rets - tbill_daily.values
133
+ sharpe = float((excess.mean() / (excess.std() + 1e-8)) * np.sqrt(252))
 
 
134
 
135
+ cum = np.cumprod(1 + rets)
136
+ peak = np.maximum.accumulate(cum)
137
+ dd = (cum - peak) / peak
138
+ max_dd = float(dd.min()) * 100
139
  max_daily_dd = float(rets.min()) * 100
140
 
141
+ signs = np.sign(rets)
142
+ hit_15 = float(pd.Series(signs).rolling(15).apply(
143
+ lambda x: (x > 0).mean()).mean())
 
144
 
145
+ # SPY benchmark
146
+ spy_dates = bench_ret.index.intersection(dates)
147
+ spy_rets = bench_ret.loc[spy_dates, "SPY"].values \
148
+ if "SPY" in bench_ret.columns else np.zeros(1)
149
+ spy_total = float((1 + pd.Series(spy_rets)).prod())
150
+ spy_ann = (spy_total ** (252 / max(len(spy_rets), 1)) - 1) * 100
151
+
152
+ # CASH days count
153
+ cash_days = int((bt["Mode"] == "CASH").sum())
154
 
155
  return dict(
156
+ ann_return = round(ann_ret, 2),
157
+ sharpe = round(sharpe, 3),
158
+ hit_ratio_15d = round(hit_15, 3),
159
+ max_drawdown = round(max_dd, 2),
160
+ max_daily_dd = round(max_daily_dd, 2),
161
+ vs_spy = round(ann_ret - spy_ann, 2),
162
+ cash_days = cash_days,
163
  )
164
 
165
 
166
  # ─── AR(1) baseline ───────────────────────────────────────────────────────────
167
 
168
+ def ar1_backtest(etf_returns, test_dates):
169
+ records = []
170
+ dates_dt = pd.to_datetime(test_dates)
171
+ df = etf_returns[etf_returns.index.isin(dates_dt)].copy()
 
 
172
  prev = df.shift(1).fillna(0)
173
  for date, row in df.iterrows():
174
+ best = prev.loc[date].idxmax()
175
+ records.append(dict(Date=date, Signal=best,
176
+ Net_Return=float(row[best])))
177
  out = pd.DataFrame(records)
178
  out["Cumulative"] = (1 + out["Net_Return"]).cumprod()
179
  return out
 
181
 
182
  # ─── Full evaluation ──────────────────────────────────────────────────────────
183
 
184
+ def run_evaluation(tsl_pct=config.DEFAULT_TSL_PCT,
185
+ z_reentry=config.DEFAULT_Z_REENTRY,
186
+ fee_bps=10):
187
 
188
  print(f"\n{'='*60}")
189
+ print(f" Evaluation — TSL={tsl_pct}% Z-reentry={z_reentry}σ "
190
+ f"Fee={fee_bps}bps")
191
  print(f"{'='*60}")
192
 
193
  data = load_local()
194
  if not data:
195
  raise RuntimeError("No data. Run data_download.py first.")
196
 
197
+ # Normalize ETF columns
198
+ from preprocess import normalize_etf_columns, flatten_columns
199
+ etf_ret = normalize_etf_columns(data["etf_ret"].copy())
200
+ etf_ret = etf_ret[[c for c in config.ETFS if c in etf_ret.columns]]
201
+ bench_ret= normalize_etf_columns(data["bench_ret"].copy())
202
+
203
+ # T-bill series
204
+ macro = flatten_columns(data["macro"].copy())
205
+ tbill = macro["TBILL_3M"] if "TBILL_3M" in macro.columns \
206
+ else pd.Series(3.6, index=macro.index)
207
+
208
+ # Best lookbacks from training summary
209
  summary_path = os.path.join(config.MODELS_DIR, "training_summary.json")
210
+ lb_map = {"model_a": 30, "model_b": 30, "model_c": 30}
211
  if os.path.exists(summary_path):
212
  with open(summary_path) as f:
213
+ s = json.load(f)
214
+ for k in lb_map:
215
+ lb_map[k] = s.get(k, {}).get("best_lookback", 30)
 
 
 
 
 
 
216
 
217
  results = {}
218
 
219
+ for tag, module, is_dual in [
220
+ ("model_a", model_a, False),
221
+ ("model_b", model_b, False),
222
+ ("model_c", model_c, True),
223
  ]:
224
+ lb = lb_map[tag]
225
  print(f"\n Evaluating {tag.upper()} (lb={lb}d)...")
226
+ prep = run_preprocessing(data, lb)
227
 
228
  try:
229
+ m = module.load_model(lb)
230
  except Exception as e:
231
  print(f" Could not load {tag}: {e}")
232
  continue
233
 
234
+ preds = raw_signals(m, prep, is_dual=is_dual)
235
+ probs = softmax_probs(preds)
236
+
237
+ print(f" probs sample (first 3 rows):\n{probs[:3]}")
238
+ print(f" z_scores sample: {compute_z_scores(probs[:5])}")
239
 
240
+ bt = backtest(probs, prep["d_te"], etf_ret, tbill,
241
+ fee_bps=fee_bps, tsl_pct=tsl_pct,
 
 
 
242
  z_reentry=z_reentry)
243
 
244
+ cash_count = (bt["Mode"] == "CASH").sum()
245
+ print(f" CASH days triggered: {cash_count} / {len(bt)}")
246
+ print(f" Signals distribution:\n{bt['Signal'].value_counts()}")
247
+
248
+ metrics = compute_metrics(bt, bench_ret, tbill)
249
  results[tag] = dict(
250
+ metrics = metrics,
251
+ lookback = lb,
252
+ audit_tail = bt.tail(20).to_dict(orient="records"),
253
+ all_signals = bt.to_dict(orient="records"),
254
  )
255
+ print(f" Ann={metrics['ann_return']}% "
256
+ f"Sharpe={metrics['sharpe']} "
257
+ f"MaxDD={metrics['max_drawdown']}% "
258
+ f"CashDays={metrics['cash_days']}")
259
 
260
  # AR(1) baseline
261
+ prep30 = run_preprocessing(data, 30)
262
+ ar1_bt = ar1_backtest(etf_ret, prep30["d_te"])
263
  ar1_rets = ar1_bt["Net_Return"].values
264
+ n = len(ar1_rets)
265
+ ar1_ann = ((1 + pd.Series(ar1_rets)).prod() ** (252/n) - 1) * 100
266
  results["ar1_baseline"] = dict(ann_return=round(float(ar1_ann), 2))
267
 
268
+ # Benchmarks
269
  for bench in config.BENCHMARKS:
270
+ test_dates = prep30["d_te"]
271
+ b_dates = bench_ret.index.intersection(pd.to_datetime(test_dates))
272
+ b_rets = bench_ret.loc[b_dates, bench].values \
273
+ if bench in bench_ret.columns else np.zeros(1)
274
  b_total = (1 + pd.Series(b_rets)).prod()
275
+ b_ann = (b_total ** (252 / max(len(b_rets),1)) - 1) * 100
276
+ b_sh = (b_rets.mean()/(b_rets.std()+1e-8))*np.sqrt(252)
277
  b_cum = np.cumprod(1 + b_rets)
278
  b_peak = np.maximum.accumulate(b_cum)
279
+ b_mdd = float(((b_cum-b_peak)/b_peak).min())*100
280
  results[bench] = dict(
281
  ann_return = round(float(b_ann), 2),
282
+ sharpe = round(float(b_sh), 3),
283
  max_drawdown = round(float(b_mdd), 2),
284
  )
285
 
286
  # Winner
287
+ valid = [k for k in ["model_a","model_b","model_c"] if k in results]
288
+ if valid:
289
+ winner = max(valid,
 
290
  key=lambda k: results[k]["metrics"]["ann_return"])
291
  results["winner"] = winner
292
  print(f"\n ⭐ WINNER: {winner.upper()} "
293
+ f"({results[winner]['metrics']['ann_return']}%)")
294
 
295
  results["evaluated_at"] = datetime.now().isoformat()
296
  results["tsl_pct"] = tsl_pct
297
  results["z_reentry"] = z_reentry
298
 
299
+ with open("evaluation_results.json","w") as f:
 
300
  json.dump(results, f, indent=2, default=str)
301
+ print(f"\n Saved → evaluation_results.json")
302
  return results
303
 
304
 
305
  if __name__ == "__main__":
306
  import argparse
307
  parser = argparse.ArgumentParser()
308
+ parser.add_argument("--tsl", type=float, default=config.DEFAULT_TSL_PCT)
309
+ parser.add_argument("--z", type=float, default=config.DEFAULT_Z_REENTRY)
310
+ parser.add_argument("--fee", type=float, default=10)
311
  args = parser.parse_args()
312
  run_evaluation(tsl_pct=args.tsl, z_reentry=args.z, fee_bps=args.fee)