Spaces:

commanderzee
/

eth-arb-trainer

Sleeping

App Files Files Community

eth-arb-trainer / train.py

commanderzee

hybrid edge-margin policy (edge_threshold + entry_price_max cap) — BTC +$1247

7d54c7c verified about 1 month ago

raw

history blame contribute delete

30.5 kB

	"""
	Training entry point for the ARB-MAX v1_arb model.

	Flow:
	1. Download markets_index + ohlcv + orderbook (filtered to asset's slugs).
	2. For each window: build_window_frame @ tick=120, extract features,
	compute optimal arb PnL & binary label.
	3. Assemble pandas frame, split 90/10, walk-forward 5-fold on 90%.
	4. Optuna TPE (seed=42, maximize) with objective =
	mean across folds of (sum_flagged_pnl - 0.2 * max_drawdown).
	5. Refit best params on full 90%, evaluate on 10% holdout.
	6. Save artifacts, upload to HF dataset `commanderzee/{asset}-v1-arb-full`.
	"""

	from __future__ import annotations

	import json
	import os
	import time
	import traceback
	from pathlib import Path
	from typing import Callable, Dict, List, Optional, Tuple

	import numpy as np
	import pandas as pd

	import lightgbm as lgb
	import optuna
	from huggingface_hub import HfApi
	from huggingface_hub.utils import HfHubHTTPError
	from sklearn.metrics import roc_auc_score

	from data_loader import (
	build_window_frame,
	get_window_label,
	load_markets_index,
	load_ohlcv,
	load_orderbook_filtered,
	)
	from features_ohlcv import FEATURE_NAMES as OHLCV_FEATS, extract as extract_ohlcv
	from features_orderbook import (
	FEATURE_NAMES as OB_FEATS,
	extract as extract_ob,
	)
	from features_state import FEATURE_NAMES as STATE_FEATS, extract as extract_state
	from labels import (
	compute_optimal_arb,
	simulate_live_policy,
	simulate_live_policy_from_eff,
	_precompute_effective_arr,
	)
	from replay import simulate_flagging_pnl


	ALL_FEATURES: List[str] = list(OHLCV_FEATS) + list(OB_FEATS) + list(STATE_FEATS)


	# ---------------------------------------------------------------------------
	# Feature assembly per window
	# ---------------------------------------------------------------------------
	def _extract_all(wf: pd.DataFrame, at_tick: int = 120) -> np.ndarray:
	a = extract_ohlcv(wf, at_tick=at_tick)
	b = extract_ob(wf, at_tick=at_tick)
	c = extract_state(wf, at_tick=at_tick)
	out = np.concatenate([a, b, c], dtype=np.float32)
	assert out.shape[0] == len(ALL_FEATURES)
	return out


	# ---------------------------------------------------------------------------
	# HF upload retry wrapper
	# ---------------------------------------------------------------------------
	def _upload_with_retry(
	api: HfApi,
	folder_path: str,
	repo_id: str,
	repo_type: str,
	hf_token: str,
	max_attempts: int = 6,
	log: Callable[[str], None] = print,
	) -> None:
	attempt = 0
	while attempt < max_attempts:
	try:
	api.upload_folder(
	folder_path=folder_path,
	repo_id=repo_id,
	repo_type=repo_type,
	token=hf_token,
	create_pr=False,
	)
	return
	except HfHubHTTPError as e: # type: ignore[attr-defined]
	status = getattr(getattr(e, "response", None), "status_code", None)
	if status is not None and (status == 429 or 500 <= status < 600):
	sleep_s = min(60.0, 2.0 ** attempt)
	log(f"[upload] HTTP {status}; retry in {sleep_s}s")
	time.sleep(sleep_s)
	attempt += 1
	continue
	raise
	except Exception as e: # noqa: BLE001
	sleep_s = min(60.0, 2.0 ** attempt)
	log(f"[upload] error {e!r}; retry in {sleep_s}s")
	time.sleep(sleep_s)
	attempt += 1
	raise RuntimeError(f"upload_folder failed for {repo_id} after {max_attempts} attempts")


	# ---------------------------------------------------------------------------
	# Dataset build
	# ---------------------------------------------------------------------------
	def _build_training_dataset(
	asset: str,
	hf_token: str,
	cache_dir: Path,
	log: Callable[[str], None] = print,
	ob_batch_size: int = 500,
	) -> pd.DataFrame:
	"""Build the per-window training dataframe.

	IMPORTANT: the orderbook parquet for big assets is ~30-40 GB on disk and
	cannot fit in a 32 GB Space's RAM. We iterate the filtered file in batches
	of `ob_batch_size` slugs via `iter_orderbook_batches`, processing each
	batch's windows end-to-end and dropping the polars DataFrame before
	advancing. Peak memory per batch ~1-2 GB.
	"""
	import gc
	import polars as pl # local import to keep module import-light

	from data_loader import iter_orderbook_slug_pairs

	log(f"[data] loading markets_index for {asset}")
	markets = load_markets_index(asset, hf_token, cache_dir)
	log(f"[data] {len(markets)} resolved updown-15m markets")

	log(f"[data] loading ohlcv_1s for {asset}")
	ohlcv = load_ohlcv(asset, hf_token, cache_dir)

	slugs = markets["slug"].to_list()
	slug_ts_list = markets["slug_ts"].to_list()
	slug_ts_map = dict(zip(slugs, [int(t) for t in slug_ts_list]))
	wanted = set(slugs)

	log(f"[data] streaming book_snapshot_5 row-groups (~{len(slugs)} slugs, "
	f"peak ~5 MB per slug)")

	rows: List[Dict] = []
	eff_cache: List[Dict] = [] # parallel to rows — holds up_eff/dn_eff arrays
	built = 0
	skipped = 0
	processed = 0

	for slug, ob_up, ob_dn in iter_orderbook_slug_pairs(
	asset, hf_token, cache_dir, wanted
	):
	processed += 1
	slug_ts = slug_ts_map.get(slug)
	if slug_ts is None:
	continue
	try:
	spot = get_window_label(slug_ts, ohlcv)
	if spot is None:
	skipped += 1
	continue
	wf = build_window_frame(slug, slug_ts, ob_up, ob_dn, ohlcv)
	feats = _extract_all(wf, at_tick=120)
	# DIRECTIONAL label: predict whether UP wins the window.
	# At tick 120 we snapshot both sides' walked-book asks (for
	# 100 shares) so the downstream trading sim can enforce the
	# "lower is better" entry-price rule.
	up_eff = _precompute_effective_arr(wf, "up", size=100)
	dn_eff = _precompute_effective_arr(wf, "dn", size=100)
	up_ask_120 = float(up_eff[120]) if len(up_eff) > 120 else 1.0
	dn_ask_120 = float(dn_eff[120]) if len(dn_eff) > 120 else 1.0
	if not np.isfinite(up_ask_120) or up_ask_120 <= 0:
	up_ask_120 = 1.0
	if not np.isfinite(dn_ask_120) or dn_ask_120 <= 0:
	dn_ask_120 = 1.0
	fee_up = 0.072 * up_ask_120 * (1.0 - up_ask_120) * 100 if 0 < up_ask_120 < 1 else 0.0
	fee_dn = 0.072 * dn_ask_120 * (1.0 - dn_ask_120) * 100 if 0 < dn_ask_120 < 1 else 0.0
	# PnL per side if entered at tick 120 and held to resolution
	if spot == 1: # UP won
	pnl_up = (1.0 - up_ask_120) * 100 - fee_up
	pnl_dn = -dn_ask_120 * 100 - fee_dn
	else: # DN won
	pnl_up = -up_ask_120 * 100 - fee_up
	pnl_dn = (1.0 - dn_ask_120) * 100 - fee_dn
	row = {
	"slug": slug,
	"slug_ts": slug_ts,
	"spot_label": spot,
	"binary_label": int(spot == 1), # 1 if UP wins
	"up_ask_120": up_ask_120,
	"dn_ask_120": dn_ask_120,
	"pnl_up": pnl_up,
	"pnl_dn": pnl_dn,
	"realized_pnl": pnl_up, # kept for schema compat; not used directly
	}
	for name, v in zip(ALL_FEATURES, feats):
	row[name] = float(v)
	rows.append(row)
	eff_cache.append({
	"slug": slug,
	"spot_label": spot,
	"up_eff": np.zeros(1, dtype=np.float32), # unused in v4 mode
	"dn_eff": np.zeros(1, dtype=np.float32),
	})
	built += 1
	except Exception as e: # noqa: BLE001
	skipped += 1
	if skipped <= 3:
	import traceback
	log(f"[data] window error slug={slug}: {e!r}\n"
	f"{traceback.format_exc()}")
	else:
	log(f"[data] window error slug={slug}: {e!r}")

	# free the row-group tables before the next one
	del ob_up, ob_dn
	if processed % 1000 == 0:
	gc.collect()
	log(f"[data] processed={processed} built={built} skipped={skipped}")

	gc.collect()
	log(f"[data] done streaming. processed={processed} built={built} skipped={skipped}")

	df = pd.DataFrame(rows)
	if len(df) == 0:
	raise RuntimeError("No valid training rows produced")
	df = df.sort_values("slug_ts").reset_index(drop=True)
	# Directional-mode diagnostics
	import numpy as _np
	if "realized_pnl" in df.columns:
	pnls = df["realized_pnl"].to_numpy()
	pcts = _np.percentile(pnls, [1, 5, 25, 50, 75, 95, 99])
	log(f"[data] realized_pnl(buy_up_@120) percentiles "
	f"1%={pcts[0]:.2f} 5%={pcts[1]:.2f} 25%={pcts[2]:.2f} "
	f"50%={pcts[3]:.2f} 75%={pcts[4]:.2f} 95%={pcts[5]:.2f} 99%={pcts[6]:.2f}")
	if "up_ask_120" in df.columns:
	ua = df["up_ask_120"].to_numpy()
	da = df["dn_ask_120"].to_numpy()
	log(f"[data] up_ask_120: median={_np.median(ua):.3f} "
	f"5%={_np.quantile(ua, 0.05):.3f} 95%={_np.quantile(ua, 0.95):.3f} \| "
	f"dn_ask_120: median={_np.median(da):.3f}")
	cheap_either = float(((ua <= 0.40) \| (da <= 0.40)).mean())
	cheap_both_lo = float(((ua <= 0.35) \| (da <= 0.35)).mean())
	log(f"[data] entry opportunity: "
	f"some_side≤$0.40={cheap_either:.3f} some_side≤$0.35={cheap_both_lo:.3f}")
	# re-sort eff_cache to mirror df row order (df was just sorted by slug_ts)
	slug_to_idx = {c["slug"]: i for i, c in enumerate(eff_cache)}
	reordered = [eff_cache[slug_to_idx[s]] for s in df["slug"].tolist() if s in slug_to_idx]
	eff_cache[:] = reordered
	df.attrs["eff_cache"] = eff_cache
	log(f"[data] final dataset rows: {len(df)}; "
	f"positive rate: {df['binary_label'].mean():.3f}")
	return df


	# ---------------------------------------------------------------------------
	# Policy-parameter Optuna
	# ---------------------------------------------------------------------------
	def _policy_optuna(
	eff_cache: List[Dict],
	n_trials: int = 100,
	log: Callable[[str], None] = print,
	) -> Dict:
	"""Optuna over the trading policy's own parameters (entry_threshold,
	combined_threshold, min_tick) — NOT the classifier's hyperparameters.

	Objective: mean realized PnL per window across the full pool. Uses the
	cached walked-book effective-ask arrays so each trial is pure numpy (~1s
	for 16k windows), making 100 trials a few minutes.
	"""
	if not eff_cache:
	log("[policy] no eff_cache available; skipping policy Optuna")
	return {"params": {"entry_threshold": 0.40, "combined_threshold": 0.80,
	"min_tick": 120}, "mean_pnl": 0.0}

	# precompute common arrays
	ups = [c["up_eff"] for c in eff_cache]
	dns = [c["dn_eff"] for c in eff_cache]
	labels = [c["spot_label"] for c in eff_cache]

	def _mean_pnl(ent: float, comb: float, tick: int) -> float:
	total = 0.0
	n = 0
	for u, d, lab in zip(ups, dns, labels):
	res = simulate_live_policy_from_eff(
	u.astype(np.float64), d.astype(np.float64), lab,
	entry_threshold=ent, combined_threshold=comb,
	size=100, min_tick=tick, binary_threshold=20.0,
	)
	total += res["realized_pnl"]
	n += 1
	return total / max(n, 1)

	def _obj(trial: optuna.Trial) -> float:
	ent = trial.suggest_float("entry_threshold", 0.20, 0.49)
	comb = trial.suggest_float("combined_threshold", 0.55, 0.95)
	tick = trial.suggest_int("min_tick", 120, 300, step=30)
	if comb - ent < 0.10: # combined must be meaningfully > entry
	return -1e6
	return _mean_pnl(ent, comb, tick)

	sampler = optuna.samplers.TPESampler(seed=42)
	study = optuna.create_study(direction="maximize", sampler=sampler)
	log(f"[policy] Optuna over {len(eff_cache)} windows × {n_trials} trials")
	study.optimize(_obj, n_trials=n_trials, show_progress_bar=False)
	return {"params": study.best_params, "mean_pnl": float(study.best_value)}


	# ---------------------------------------------------------------------------
	# Walk-forward folds
	# ---------------------------------------------------------------------------
	def _walk_forward_folds(n: int, n_folds: int = 5) -> List[Tuple[int, int, int]]:
	"""Return list of (train_end, test_start, test_end) triples.

	Expanding-window walk-forward over the WF pool [0, n). The pool is split
	into (n_folds + 1) contiguous segments. Fold k trains on the first (k+1)
	segments and tests on segment (k+1). No overlap with the final holdout
	(caller passes `n` = length of the 90% pool, not the full dataset).
	Fold k: train [0, (k+1)/(n_folds+1) * n), test [(k+1)/(n_folds+1) * n, (k+2)/(n_folds+1) * n)
	"""
	out: List[Tuple[int, int, int]] = []
	for k in range(n_folds):
	train_end = int(round((k + 1) / (n_folds + 1) * n))
	test_end = int(round((k + 2) / (n_folds + 1) * n))
	test_start = train_end
	if test_end <= test_start:
	continue
	out.append((train_end, test_start, test_end))
	return out


	# ---------------------------------------------------------------------------
	# Optuna objective
	# ---------------------------------------------------------------------------
	def _train_fold_core(
	X_train: np.ndarray,
	y_train: np.ndarray,
	X_test: np.ndarray,
	y_test: np.ndarray,
	params: Dict,
	) -> Dict:
	"""Train LightGBM on fold's train slice with internal early-stopping,
	predict on X_test. Returns preds + booster + AUC. No PnL / threshold
	logic here — that's applied per-trial in the Optuna objective so the
	same trained model can be reused across many threshold combos."""
	n_tr = X_train.shape[0]
	es_cut = max(200, int(0.9 * n_tr)) if n_tr >= 500 else n_tr
	X_fit, y_fit = X_train[:es_cut], y_train[:es_cut]
	X_es, y_es = X_train[es_cut:], y_train[es_cut:]

	params_no_n = {k: v for k, v in params.items() if k != "n_estimators"}
	dtrain = lgb.Dataset(X_fit, label=y_fit, feature_name=ALL_FEATURES)

	callbacks = [lgb.log_evaluation(0)]
	valid_sets = None
	if len(X_es) >= 20 and len(set(y_es.tolist())) > 1:
	dvalid = lgb.Dataset(
	X_es, label=y_es, feature_name=ALL_FEATURES, reference=dtrain
	)
	valid_sets = [dvalid]
	callbacks.insert(0, lgb.early_stopping(50, verbose=False))

	booster = lgb.train(
	params_no_n, dtrain,
	num_boost_round=params.get("n_estimators", 500),
	valid_sets=valid_sets, callbacks=callbacks,
	)
	best_iter = booster.best_iteration if valid_sets is not None else None
	preds = booster.predict(X_test, num_iteration=best_iter)
	try:
	auc = float(roc_auc_score(y_test, preds)) if len(set(y_test.tolist())) > 1 else 0.5
	except Exception:
	auc = 0.5
	return {"auc": auc, "booster": booster, "preds": preds}


	def _directional_policy_apply(
	preds: np.ndarray,
	up_asks: np.ndarray,
	dn_asks: np.ndarray,
	pnl_ups: np.ndarray,
	pnl_dns: np.ndarray,
	edge_threshold: float,
	entry_price_max: float = 1.0,
	) -> Tuple[np.ndarray, np.ndarray]:
	"""Edge-margin decision rule.

	On Polymarket the ask for side S is approximately the market's implied
	P(S wins). So `edge_up = model_pred - up_ask` = our estimate minus the
	market's — the expected PnL per share before fees. If this exceeds a
	threshold on either side, we trade that side. Whichever edge is larger
	wins when both clear threshold.

	Single-knob policy (one Optuna param) — drops the prior 3-param setup
	(p_up_threshold, p_dn_threshold, entry_price_max) which was
	susceptible to overfit in Optuna's larger search space."""
	n = len(preds)
	flag = np.zeros(n, dtype=bool)
	pnl = np.zeros(n, dtype=np.float64)
	for i in range(n):
	edge_up = preds[i] - up_asks[i]
	edge_dn = (1.0 - preds[i]) - dn_asks[i]
	up_ok = (edge_up >= edge_threshold) and (up_asks[i] <= entry_price_max)
	dn_ok = (edge_dn >= edge_threshold) and (dn_asks[i] <= entry_price_max)
	if up_ok and (not dn_ok or edge_up >= edge_dn):
	flag[i] = True
	pnl[i] = pnl_ups[i]
	elif dn_ok:
	flag[i] = True
	pnl[i] = pnl_dns[i]
	return flag, pnl


	def _lgb_params_from_trial(trial: optuna.Trial) -> Tuple[Dict, Dict]:
	params = {
	"objective": "binary",
	"metric": "binary_logloss",
	"verbosity": -1,
	"boosting_type": "gbdt",
	"num_leaves": trial.suggest_int("num_leaves", 31, 127),
	"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
	"feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
	"bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
	"bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
	"min_child_samples": trial.suggest_int("min_child_samples", 20, 200),
	"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
	"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
	"n_estimators": 500,
	}
	trading = {
	# primary: edge-margin (model_pred - market_ask for the chosen side)
	"edge_threshold": trial.suggest_float("edge_threshold", 0.03, 0.25),
	# safety cap: block high-entry overconfident trades where the
	# model's probability is likely uncalibrated. Without this, BTC
	# lost -$621 on a run (vs +$390 with the cap).
	"entry_price_max": trial.suggest_float("entry_price_max", 0.40, 0.65),
	}
	return params, trading


	# ---------------------------------------------------------------------------
	# Calibration table (10 buckets)
	# ---------------------------------------------------------------------------
	def _calibration_buckets(preds: np.ndarray, y: np.ndarray, n_buckets: int = 10) -> List[Dict]:
	preds = np.asarray(preds)
	y = np.asarray(y)
	edges = np.linspace(0.0, 1.0, n_buckets + 1)
	out = []
	for i in range(n_buckets):
	lo, hi = edges[i], edges[i + 1]
	mask = (preds >= lo) & (preds < hi) if i < n_buckets - 1 else (preds >= lo) & (preds <= hi)
	n_here = int(mask.sum())
	if n_here == 0:
	out.append({"bucket": i, "lo": lo, "hi": hi, "n": 0, "predicted": 0.0, "actual": 0.0})
	continue
	out.append(
	{
	"bucket": i,
	"lo": float(lo),
	"hi": float(hi),
	"n": n_here,
	"predicted": float(preds[mask].mean()),
	"actual": float(y[mask].mean()),
	}
	)
	return out


	# ---------------------------------------------------------------------------
	# Main entry
	# ---------------------------------------------------------------------------
	def run_training(
	asset: str,
	hf_token: str,
	variant: str = "v1_arb",
	n_trials: int = 150,
	n_folds: int = 5,
	log: Callable[[str], None] = print,
	cache_dir: str = "/tmp/cache",
	output_dir: str = "/tmp/output",
	) -> Dict:
	cache_path = Path(cache_dir)
	out_path = Path(output_dir)
	cache_path.mkdir(parents=True, exist_ok=True)
	out_path.mkdir(parents=True, exist_ok=True)

	log(f"[train] asset={asset} variant={variant} trials={n_trials} folds={n_folds}")

	# 1. Build dataset
	df = _build_training_dataset(asset, hf_token, cache_path, log=log)
	df = df.sort_values("slug_ts").reset_index(drop=True)

	# DIRECTIONAL: label is "UP wins the window?" (spot_label), 1/0 balanced.
	# Per row we also track entry-price (tick 120) for each side and the
	# realized per-side PnL if we bought 100 shares at that price and held
	# to resolution. Asymmetry: the lower the ask we buy at, the larger the
	# payout on a correct call — this is enforced at trading-policy time
	# via the Optuna-tuned `entry_price_max` cap.
	log(f"[train] directional: up_win_rate={float(df['binary_label'].mean()):.3f} "
	f"mean_pnl_up={float(df['pnl_up'].mean()):.2f} "
	f"mean_pnl_dn={float(df['pnl_dn'].mean()):.2f} "
	f"up_ask_median={float(df['up_ask_120'].median()):.3f} "
	f"dn_ask_median={float(df['dn_ask_120'].median()):.3f}")

	X_all = df[ALL_FEATURES].to_numpy(dtype=np.float32)
	y_all = df["binary_label"].to_numpy(dtype=np.int32)
	up_ask_all = df["up_ask_120"].to_numpy(dtype=np.float64)
	dn_ask_all = df["dn_ask_120"].to_numpy(dtype=np.float64)
	pnl_up_all = df["pnl_up"].to_numpy(dtype=np.float64)
	pnl_dn_all = df["pnl_dn"].to_numpy(dtype=np.float64)

	n = len(df)
	split = int(round(0.9 * n))
	X_pool = X_all[:split]
	y_pool = y_all[:split]
	up_ask_pool = up_ask_all[:split]
	dn_ask_pool = dn_ask_all[:split]
	pnl_up_pool = pnl_up_all[:split]
	pnl_dn_pool = pnl_dn_all[:split]
	X_hold = X_all[split:]
	y_hold = y_all[split:]
	up_ask_hold = up_ask_all[split:]
	dn_ask_hold = dn_ask_all[split:]
	pnl_up_hold = pnl_up_all[split:]
	pnl_dn_hold = pnl_dn_all[split:]
	log(f"[train] pool={len(X_pool)} holdout={len(X_hold)}")

	folds = _walk_forward_folds(len(X_pool), n_folds=n_folds)

	def _objective(trial: optuna.Trial) -> float:
	params, trading = _lgb_params_from_trial(trial)
	fold_scores = []
	for (train_end, test_start, test_end) in folds:
	Xt = X_pool[:train_end]
	yt = y_pool[:train_end]
	Xv = X_pool[test_start:test_end]
	yv = y_pool[test_start:test_end]
	uav = up_ask_pool[test_start:test_end]
	dav = dn_ask_pool[test_start:test_end]
	pup_v = pnl_up_pool[test_start:test_end]
	pdn_v = pnl_dn_pool[test_start:test_end]
	if len(Xt) < 50 or len(Xv) < 20 or len(set(yt.tolist())) < 2:
	continue
	try:
	res = _train_fold_core(Xt, yt, Xv, yv, params)
	except Exception as e: # noqa: BLE001
	log(f"[optuna] fold error: {e!r}")
	continue
	flag, pnl = _directional_policy_apply(
	res["preds"], uav, dav, pup_v, pdn_v,
	trading["edge_threshold"],
	trading["entry_price_max"],
	)
	sim = simulate_flagging_pnl(pnl, flag)
	score = sim["sum_flagged"] - 0.2 * sim["max_drawdown"]
	fold_scores.append(score)
	if not fold_scores:
	return -1e9
	return float(np.mean(fold_scores))

	sampler = optuna.samplers.TPESampler(seed=42)
	study = optuna.create_study(direction="maximize", sampler=sampler)

	log(f"[optuna] starting with {n_trials} trials")
	study.optimize(_objective, n_trials=n_trials, show_progress_bar=False)

	best_params_trial = dict(study.best_trial.params)
	best_trading = {
	"edge_threshold": float(best_params_trial.pop("edge_threshold")),
	"entry_price_max": float(best_params_trial.pop("entry_price_max")),
	}
	best_lgb_params = {
	"objective": "binary",
	"metric": "binary_logloss",
	"verbosity": -1,
	"boosting_type": "gbdt",
	"n_estimators": 500,
	**best_params_trial,
	}
	log(f"[optuna] best value={study.best_value:.4f} trading={best_trading}")
	log(f"[optuna] best lgb params: {best_params_trial}")

	fold_metrics: List[Dict] = []
	per_fold_thresholds: Dict[str, float] = {}

	for k, (train_end, test_start, test_end) in enumerate(folds):
	Xt = X_pool[:train_end]
	yt = y_pool[:train_end]
	Xv = X_pool[test_start:test_end]
	yv = y_pool[test_start:test_end]
	uav = up_ask_pool[test_start:test_end]
	dav = dn_ask_pool[test_start:test_end]
	pup_v = pnl_up_pool[test_start:test_end]
	pdn_v = pnl_dn_pool[test_start:test_end]
	if len(Xt) < 50 or len(Xv) < 20 or len(set(yt.tolist())) < 2:
	fold_metrics.append({"fold": k, "skipped": True})
	continue
	res = _train_fold_core(Xt, yt, Xv, yv, best_lgb_params)
	flag, pnl = _directional_policy_apply(
	res["preds"], uav, dav, pup_v, pdn_v,
	best_trading["edge_threshold"],
	best_trading["entry_price_max"],
	)
	sim = simulate_flagging_pnl(pnl, flag)
	# counterfactual: "trade every window, always buy cheaper side"
	pnl_all_cheaper = np.where(uav <= dav, pup_v, pdn_v)
	cf = simulate_flagging_pnl(pnl_all_cheaper, np.ones_like(pnl_all_cheaper, dtype=bool))
	fold_metrics.append(
	{
	"fold": k,
	"train_end": train_end,
	"test_start": test_start,
	"test_end": test_end,
	"n_train": len(Xt),
	"n_test": len(Xv),
	"auc": res["auc"],
	"n_flagged": sim["n_flagged"],
	"n_total": sim["n_total"],
	"flagged_ratio": sim["flagged_ratio"],
	"realized_sum_pnl": sim["sum_flagged"],
	"realized_mean_pnl_of_flagged": sim["mean_flagged"],
	"realized_median_pnl_of_flagged": sim["median_flagged"],
	"hit_rate": sim["hit_rate"],
	"max_drawdown": sim["max_drawdown"],
	"flag_all_baseline_sum_pnl": cf["sum_flagged"],
	"calibration": _calibration_buckets(res["preds"], yv),
	}
	)

	# 2. Final refit on full pool, evaluate on holdout
	log(f"[train] refit on pool size={len(X_pool)} → predict holdout size={len(X_hold)}")
	dtrain = lgb.Dataset(X_pool, label=y_pool, feature_name=ALL_FEATURES)
	final_booster = lgb.train(
	best_lgb_params,
	dtrain,
	num_boost_round=best_lgb_params.get("n_estimators", 500),
	callbacks=[lgb.log_evaluation(0)],
	)
	if len(X_hold) > 0:
	hold_preds = final_booster.predict(X_hold)
	hold_flag, hold_pnl = _directional_policy_apply(
	hold_preds, up_ask_hold, dn_ask_hold, pnl_up_hold, pnl_dn_hold,
	best_trading["edge_threshold"],
	best_trading["entry_price_max"],
	)
	hold_sim = simulate_flagging_pnl(hold_pnl, hold_flag)
	try:
	hold_auc = (
	float(roc_auc_score(y_hold, hold_preds))
	if len(set(y_hold.tolist())) > 1
	else 0.5
	)
	except Exception:
	hold_auc = 0.5
	hold_cal = _calibration_buckets(hold_preds, y_hold)
	cheaper_hold = np.where(up_ask_hold <= dn_ask_hold, pnl_up_hold, pnl_dn_hold)
	hold_cf = simulate_flagging_pnl(cheaper_hold, np.ones_like(cheaper_hold, dtype=bool))
	else:
	hold_preds = np.array([])
	hold_sim = {
	"sum_flagged": 0.0, "mean_flagged": 0.0, "median_flagged": 0.0,
	"max_drawdown": 0.0, "hit_rate": 0.0, "n_flagged": 0, "n_total": 0,
	"flagged_ratio": 0.0,
	}
	hold_auc = 0.5
	hold_cal = []
	hold_cf = {"sum_flagged": 0.0}

	# 3. Save artifacts
	model_path = out_path / "lgbm_model.txt"
	final_booster.save_model(str(model_path))

	(out_path / "feature_spec.json").write_text(
	json.dumps(
	{
	"ohlcv_features": list(OHLCV_FEATS),
	"orderbook_features": list(OB_FEATS),
	"state_features": list(STATE_FEATS),
	"all_features": list(ALL_FEATURES),
	"n_features": len(ALL_FEATURES),
	},
	indent=2,
	)
	)
	(out_path / "thresholds.json").write_text(
	json.dumps(
	{
	"trading": best_trading,
	"notes": "Edge-margin rule: edge_up = model_pred - up_ask; "
	"edge_dn = (1 - model_pred) - dn_ask. Enter the side "
	"with the larger edge, iff that edge >= edge_threshold. "
	"Edge is expected PnL per share before fees.",
	},
	indent=2,
	)
	)
	(out_path / "fold_metrics.json").write_text(json.dumps(fold_metrics, indent=2))

	trial_values = [
	t.value if t.value is not None else float("nan")
	for t in study.trials
	if t.state == optuna.trial.TrialState.COMPLETE
	]
	(out_path / "optuna_report.json").write_text(
	json.dumps(
	{
	"asset": asset,
	"variant": variant,
	"n_trials_requested": n_trials,
	"n_trials_completed": len(trial_values),
	"best_value": float(study.best_value),
	"best_params": dict(study.best_trial.params),
	"trial_values": trial_values,
	},
	indent=2,
	)
	)
	(out_path / "holdout_metrics.json").write_text(
	json.dumps(
	{
	"n_holdout": len(X_hold),
	"auc": hold_auc,
	"sim": hold_sim,
	"calibration": hold_cal,
	"flag_all_baseline_sum_pnl": hold_cf.get("sum_flagged", 0.0),
	},
	indent=2,
	)
	)

	# 4. Upload
	try:
	api = HfApi(token=hf_token)
	target_repo = f"commanderzee/{asset.lower()}-v1-arb-full"
	try:
	api.create_repo(repo_id=target_repo, repo_type="dataset", exist_ok=True, token=hf_token)
	except Exception as e: # noqa: BLE001
	log(f"[upload] create_repo warning: {e!r}")
	_upload_with_retry(api, str(out_path), target_repo, "dataset", hf_token, log=log)
	log(f"[upload] pushed artifacts to {target_repo}")
	except Exception as e: # noqa: BLE001
	log(f"[upload] FAILED: {e!r}")
	log(traceback.format_exc())

	return {
	"asset": asset,
	"variant": variant,
	"n_windows": int(n),
	"n_pool": int(len(X_pool)),
	"n_holdout": int(len(X_hold)),
	"best_value": float(study.best_value),
	"best_trading": best_trading,
	"holdout": {"auc": hold_auc, "sim": hold_sim},
	"output_dir": str(out_path),
	}


	__all__ = ["run_training", "ALL_FEATURES"]