[auto] Sync code from GitHub

evaluate.py  CHANGED  (+157 -174)
Removed by this commit (the old module header and an unused import):

@@ -1,14 +1,8 @@
 # evaluate.py
-# Backtests all 3 models on the test set.
-# Applies trailing stop loss (TSL) + Z-score re-entry logic.
-# Compares vs SPY and AGG buy-and-hold and AR(1) baseline.
-# Outputs evaluation_results.json
-
 import json
 import os
 import numpy as np
 import pandas as pd
-from scipy import stats
 from datetime import datetime
 
 import config

Remaining hunks of the old file (removed-line bodies truncated in the capture):

@@ -17,101 +11,84 @@ from preprocess import run_preprocessing
@@ -123,14 +100,14 @@ def backtest(probs: np.ndarray,
@@ -143,65 +120,60 @@ def backtest(probs: np.ndarray,
@@ -209,121 +181,132 @@ def ar1_backtest(etf_returns: pd.DataFrame,

Updated evaluate.py (added and context lines of the diff):
# evaluate.py

import json
import os
import numpy as np
import pandas as pd
from datetime import datetime

import config
# … (unchanged import lines not shown in the diff)
from preprocess import run_preprocessing
import model_a, model_b, model_c


# ─── Signal generation ────────────────────────────────────────────────────────

def raw_signals(model, prep, is_dual=False):
    # Run the model on the test set; returns the predicted log-return matrix (N_test, 5)
    X_te = prep["X_te"]
    if is_dual:
        # Dual-input model: split the feature axis into ETF and macro blocks
        n = prep["n_etf_features"]
        inputs = [X_te[:, :, :n], X_te[:, :, n:]]
    else:
        inputs = X_te
    return model.predict(inputs, verbose=0)  # (N, 5)


def softmax_probs(preds):
    """Row-wise softmax — probabilities sum to 1."""
    preds = np.array(preds)
    # Subtract the row max for numerical stability
    e = np.exp(preds - preds.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)  # (N, 5)
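# Illustrative check (made-up numbers): softmax_probs(np.array([[0.1, 0.0, 0.0, 0.0, 0.0]]))
# ≈ [[0.2165, 0.1959, 0.1959, 0.1959, 0.1959]]; each row sums to 1, and subtracting the row
# max only shifts the exponents, so the result is identical but overflow-safe.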


def compute_z_scores(probs):
    """
    Per-row Z-score: how many std devs is the top ETF above the row mean?
    Returns array of shape (N,)
    """
    top = probs.max(axis=1)            # (N,)
    mu = probs.mean(axis=1)            # (N,)
    sigma = probs.std(axis=1) + 1e-8   # (N,)
    return (top - mu) / sigma          # (N,)
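# Illustrative check (made-up numbers): a probability row [0.40, 0.15, 0.15, 0.15, 0.15]
# has mean 0.20 and population std 0.10, so its top ETF sits (0.40 - 0.20) / 0.10 = 2.0
# standard deviations above the row mean.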


# ─── TSL backtest ─────────────────────────────────────────────────────────────

def backtest(probs, dates, etf_returns, tbill_series,
             fee_bps=10, tsl_pct=10.0, z_reentry=1.1):
    """
    Day-by-day backtest with proper TSL + Z-score re-entry logic.
    tsl_pct:   float, e.g. 10.0 means trigger at a -10% 2-day cumulative return
    z_reentry: float, e.g. 1.1 sigma
    """
    z_scores = compute_z_scores(probs)   # (N,) — one per day
    records = []
    in_cash = False
    prev_ret = 0.0
    prev2_ret = 0.0
    last_signal = None

    for i in range(len(probs)):
        date = pd.Timestamp(dates[i])
        prob = probs[i]                  # (5,)
        z = float(z_scores[i])           # scalar for this day
        top_i = int(np.argmax(prob))
        etf = config.ETFS[top_i]
        conf = float(prob[top_i])        # already a probability 0-1

        # 2-day cumulative return (previous 2 days)
        two_day_cumul_pct = (prev_ret + prev2_ret) * 100

        # ── TSL trigger ──────────────────────────────────────────────────────
        if not in_cash and two_day_cumul_pct <= -tsl_pct:
            in_cash = True

        # ── Z-score re-entry ─────────────────────────────────────────────────
        if in_cash and z >= z_reentry:
            in_cash = False

        # ── Get actual return ────────────────────────────────────────────────
        if date in etf_returns.index:
            if in_cash:
                tbill_rate = float(tbill_series.get(date, 3.6))
                gross_ret = (tbill_rate / 100) / 252
                fee = 0.0
                mode = "CASH"
                signal = "CASH"
            else:
                gross_ret = float(etf_returns.loc[date, etf]) \
                    if etf in etf_returns.columns else 0.0
                fee = (fee_bps / 10000) if etf != last_signal else 0.0
                gross_ret -= fee
                mode = "ETF"
                signal = etf
        # … (unchanged lines not shown in the diff)
        signal = "CASH" if in_cash else etf

        records.append(dict(
            Date              = str(date.date()),
            Signal            = signal,
            Confidence        = round(conf, 4),              # 0-1 float
            Z_Score           = round(z, 4),                 # per-day z
            Two_Day_Cumul_Pct = round(two_day_cumul_pct, 2),
            Mode              = mode,
            Net_Return        = round(gross_ret, 6),
            TSL_Triggered     = in_cash,
        ))

        prev2_ret = prev_ret
        # … (remainder of backtest unchanged, not shown in the diff)
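# Illustrative trace of the stop logic (made-up numbers): with tsl_pct=10, two consecutive
# daily returns of -6% and -5% give two_day_cumul_pct = -11 <= -10, so the next day the
# book flips to CASH and earns the daily T-bill rate; it re-enters the top-probability ETF
# on the first day whose z-score reaches z_reentry.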


# ─── Performance metrics ──────────────────────────────────────────────────────

def compute_metrics(bt, bench_ret, tbill_series):
    rets = bt["Net_Return"].values
    dates = pd.to_datetime(bt["Date"])
    n_days = len(rets)

    # Annualised return from the cumulative growth factor
    total = float((1 + pd.Series(rets)).prod())
    ann_ret = (total ** (252 / n_days) - 1) * 100

    # Sharpe ratio of daily excess returns over the 3m T-bill
    tbill_daily = tbill_series.reindex(dates).ffill().fillna(3.6) / 100 / 252
    excess = rets - tbill_daily.values
    sharpe = float((excess.mean() / (excess.std() + 1e-8)) * np.sqrt(252))

    # Max drawdown of the equity curve, plus the worst single day
    cum = np.cumprod(1 + rets)
    peak = np.maximum.accumulate(cum)
    dd = (cum - peak) / peak
    max_dd = float(dd.min()) * 100
    max_daily_dd = float(rets.min()) * 100

    # 15-day rolling hit ratio (share of positive days)
    signs = np.sign(rets)
    hit_15 = float(pd.Series(signs).rolling(15).apply(
        lambda x: (x > 0).mean()).mean())

    # SPY benchmark
    spy_dates = bench_ret.index.intersection(dates)
    spy_rets = bench_ret.loc[spy_dates, "SPY"].values \
        if "SPY" in bench_ret.columns else np.zeros(1)
    spy_total = float((1 + pd.Series(spy_rets)).prod())
    spy_ann = (spy_total ** (252 / max(len(spy_rets), 1)) - 1) * 100

    # CASH days count
    cash_days = int((bt["Mode"] == "CASH").sum())

    return dict(
        ann_return    = round(ann_ret, 2),
        sharpe        = round(sharpe, 3),
        hit_ratio_15d = round(hit_15, 3),
        max_drawdown  = round(max_dd, 2),
        max_daily_dd  = round(max_daily_dd, 2),
        vs_spy        = round(ann_ret - spy_ann, 2),
        cash_days     = cash_days,
    )
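# Illustrative check (made-up numbers): a cumulative growth factor of 1.10 over 126 trading
# days annualises to 1.10 ** (252 / 126) - 1 = 0.21, i.e. ann_return = 21.0; vs_spy is then
# that figure minus SPY's annualised return over the same dates.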


# ─── AR(1) baseline ───────────────────────────────────────────────────────────

def ar1_backtest(etf_returns, test_dates):
    records = []
    dates_dt = pd.to_datetime(test_dates)
    df = etf_returns[etf_returns.index.isin(dates_dt)].copy()
    prev = df.shift(1).fillna(0)
    for date, row in df.iterrows():
        best = prev.loc[date].idxmax()
        records.append(dict(Date=date, Signal=best,
                            Net_Return=float(row[best])))
    out = pd.DataFrame(records)
    out["Cumulative"] = (1 + out["Net_Return"]).cumprod()
    return out
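# As written, this baseline is a one-step persistence rule: each day it holds the ETF with
# the best return on the previous day and books that day's actual return, i.e. a trading
# stand-in for an AR(1)-style "yesterday predicts today" signal.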


# ─── Full evaluation ──────────────────────────────────────────────────────────

def run_evaluation(tsl_pct=config.DEFAULT_TSL_PCT,
                   z_reentry=config.DEFAULT_Z_REENTRY,
                   fee_bps=10):

    print(f"\n{'='*60}")
    print(f" Evaluation — TSL={tsl_pct}% Z-reentry={z_reentry}σ "
          f"Fee={fee_bps}bps")
    print(f"{'='*60}")

    data = load_local()
    if not data:
        raise RuntimeError("No data. Run data_download.py first.")

    # Normalize ETF columns
    from preprocess import normalize_etf_columns, flatten_columns
    etf_ret = normalize_etf_columns(data["etf_ret"].copy())
    etf_ret = etf_ret[[c for c in config.ETFS if c in etf_ret.columns]]
    bench_ret = normalize_etf_columns(data["bench_ret"].copy())

    # T-bill series
    macro = flatten_columns(data["macro"].copy())
    tbill = macro["TBILL_3M"] if "TBILL_3M" in macro.columns \
        else pd.Series(3.6, index=macro.index)

    # Best lookbacks from training summary
    summary_path = os.path.join(config.MODELS_DIR, "training_summary.json")
    lb_map = {"model_a": 30, "model_b": 30, "model_c": 30}
    if os.path.exists(summary_path):
        with open(summary_path) as f:
            s = json.load(f)
        for k in lb_map:
            lb_map[k] = s.get(k, {}).get("best_lookback", 30)
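    # Assumed shape of training_summary.json (implied by the .get() chain above):
    # {"model_a": {"best_lookback": 30}, "model_b": {...}, "model_c": {...}};
    # any missing key falls back to a 30-day lookback.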

    results = {}

    for tag, module, is_dual in [
        ("model_a", model_a, False),
        ("model_b", model_b, False),
        ("model_c", model_c, True),
    ]:
        lb = lb_map[tag]
        print(f"\n Evaluating {tag.upper()} (lb={lb}d)...")
        prep = run_preprocessing(data, lb)

        try:
            m = module.load_model(lb)
        except Exception as e:
            print(f" Could not load {tag}: {e}")
            continue

        preds = raw_signals(m, prep, is_dual=is_dual)
        probs = softmax_probs(preds)

        print(f" probs sample (first 3 rows):\n{probs[:3]}")
        print(f" z_scores sample: {compute_z_scores(probs[:5])}")

        bt = backtest(probs, prep["d_te"], etf_ret, tbill,
                      fee_bps=fee_bps, tsl_pct=tsl_pct,
                      z_reentry=z_reentry)

        cash_count = (bt["Mode"] == "CASH").sum()
        print(f" CASH days triggered: {cash_count} / {len(bt)}")
        print(f" Signals distribution:\n{bt['Signal'].value_counts()}")

        metrics = compute_metrics(bt, bench_ret, tbill)
        results[tag] = dict(
            metrics     = metrics,
            lookback    = lb,
            audit_tail  = bt.tail(20).to_dict(orient="records"),
            all_signals = bt.to_dict(orient="records"),
        )
        print(f" Ann={metrics['ann_return']}% "
              f"Sharpe={metrics['sharpe']} "
              f"MaxDD={metrics['max_drawdown']}% "
              f"CashDays={metrics['cash_days']}")

    # AR(1) baseline
    prep30 = run_preprocessing(data, 30)
    ar1_bt = ar1_backtest(etf_ret, prep30["d_te"])
    ar1_rets = ar1_bt["Net_Return"].values
    n = len(ar1_rets)
    ar1_ann = ((1 + pd.Series(ar1_rets)).prod() ** (252 / n) - 1) * 100
    results["ar1_baseline"] = dict(ann_return=round(float(ar1_ann), 2))

    # Benchmarks
    for bench in config.BENCHMARKS:
        test_dates = prep30["d_te"]
        b_dates = bench_ret.index.intersection(pd.to_datetime(test_dates))
        b_rets = bench_ret.loc[b_dates, bench].values \
            if bench in bench_ret.columns else np.zeros(1)
        b_total = (1 + pd.Series(b_rets)).prod()
        b_ann = (b_total ** (252 / max(len(b_rets), 1)) - 1) * 100
        b_sh = (b_rets.mean() / (b_rets.std() + 1e-8)) * np.sqrt(252)
        b_cum = np.cumprod(1 + b_rets)
        b_peak = np.maximum.accumulate(b_cum)
        b_mdd = float(((b_cum - b_peak) / b_peak).min()) * 100
        results[bench] = dict(
            ann_return   = round(float(b_ann), 2),
            sharpe       = round(float(b_sh), 3),
            max_drawdown = round(float(b_mdd), 2),
        )

    # Winner
    valid = [k for k in ["model_a", "model_b", "model_c"] if k in results]
    if valid:
        winner = max(valid,
                     key=lambda k: results[k]["metrics"]["ann_return"])
        results["winner"] = winner
        print(f"\n → WINNER: {winner.upper()} "
              f"({results[winner]['metrics']['ann_return']}%)")

    results["evaluated_at"] = datetime.now().isoformat()
    results["tsl_pct"] = tsl_pct
    results["z_reentry"] = z_reentry

    with open("evaluation_results.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\n Saved → evaluation_results.json")
    return results
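# The saved evaluation_results.json therefore holds, per model, "metrics", "lookback",
# "audit_tail" and "all_signals", plus "ar1_baseline", one entry per benchmark in
# config.BENCHMARKS, "winner", "evaluated_at", "tsl_pct" and "z_reentry".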


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--tsl", type=float, default=config.DEFAULT_TSL_PCT)
    parser.add_argument("--z",   type=float, default=config.DEFAULT_Z_REENTRY)
    parser.add_argument("--fee", type=float, default=10)
    args = parser.parse_args()
    run_evaluation(tsl_pct=args.tsl, z_reentry=args.z, fee_bps=args.fee)
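# Example invocation (flag values are illustrative; the real defaults come from config):
#   python evaluate.py --tsl 10 --z 1.1 --fee 10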