| |
| """ |
| Train BTC v4 Offline IQL Trading Agent. |
| |
| Usage: |
| python -m rl_btc_v4.train |
| python -m rl_btc_v4.train --data-path /path/to/parquet --epochs 100 --hidden-dim 256 |
| python -m rl_btc_v4.train --help |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import time |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
|
|
| from rl_btc_v4.constants import ( |
| DEFAULT_DATA_PATH, |
| MARKET_FEATURE_COLUMNS, |
| N_ACTIONS, |
| DEFAULT_HISTORY_LENGTH, |
| DEFAULT_EPISODE_SPAN_DAYS, |
| DEFAULT_EPISODE_STRIDE_DAYS, |
| TAKER_FEE_RATE, |
| ) |
| from rl_btc_v4.dataset import build_offline_rl_dataset |
| from rl_btc_v4.iql_trainer import IQLTrainer, IQLConfig |
|
|
|
|
| def train( |
| *, |
| data_path: str | None = None, |
| outdir: str = "artifacts_rl_btc_v4_iql", |
| history_length: int = DEFAULT_HISTORY_LENGTH, |
| episode_span_days: int = DEFAULT_EPISODE_SPAN_DAYS, |
| episode_stride_days: int = DEFAULT_EPISODE_STRIDE_DAYS, |
| hidden_dim: int = 256, |
| num_layers: int = 2, |
| dropout: float = 0.0, |
| expectile: float = 0.7, |
| temperature: float = 3.0, |
| gamma: float = 0.99, |
| tau: float = 0.005, |
| learning_rate: float = 3e-4, |
| batch_size: int = 256, |
| num_epochs: int = 100, |
| weight_decay: float = 1e-4, |
| risk_lambda: float = 1.0, |
| soft_dd_penalty: float = 0.50, |
| behavioral_policy_mode: str = "conservative", |
| behavioral_temperature: float = 1.0, |
| min_trade_edge: float = 0.005, |
| behavioral_epsilon: float = 0.03, |
| taker_fee_rate: float = TAKER_FEE_RATE, |
| test_fraction: float = 0.2, |
| seed: int = 42, |
| device: str | None = None, |
| ) -> dict: |
| """Full training pipeline.""" |
| out_path = Path(outdir) |
| out_path.mkdir(parents=True, exist_ok=True) |
|
|
| data_path = Path(data_path) if data_path else DEFAULT_DATA_PATH |
| import torch |
| if device is None: |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
| print(f"[v4 IQL] Building offline RL dataset from {data_path}") |
| print(f" History length: {history_length}") |
| print(f" Episode span: {episode_span_days}d, stride: {episode_stride_days}d") |
| print(f" Risk lambda: {risk_lambda}, DD penalty: {soft_dd_penalty}") |
|
|
| train_dataset, test_dataset = build_offline_rl_dataset( |
| data_path=data_path, |
| history_length=history_length, |
| episode_span_days=episode_span_days, |
| episode_stride_days=episode_stride_days, |
| risk_lambda=risk_lambda, |
| soft_dd_penalty=soft_dd_penalty, |
| behavioral_policy_mode=behavioral_policy_mode, |
| behavioral_temperature=behavioral_temperature, |
| min_trade_edge=min_trade_edge, |
| behavioral_epsilon=behavioral_epsilon, |
| test_fraction=test_fraction, |
| seed=seed, |
| taker_fee_rate=taker_fee_rate, |
| ) |
|
|
| print(f"[v4 IQL] Train dataset: {train_dataset.to_dict()}") |
| print(f"[v4 IQL] Test dataset: {test_dataset.to_dict()}") |
|
|
| |
| dataset_info = { |
| "train": train_dataset.to_dict(), |
| "test": test_dataset.to_dict(), |
| } |
| (out_path / "dataset_info.json").write_text(json.dumps(dataset_info, indent=2)) |
|
|
| |
| state_dim = train_dataset.states.shape[1] |
|
|
| |
| config = IQLConfig( |
| hidden_dim=hidden_dim, |
| num_layers=num_layers, |
| dropout=dropout, |
| expectile=expectile, |
| temperature=temperature, |
| gamma=gamma, |
| tau=tau, |
| learning_rate=learning_rate, |
| batch_size=batch_size, |
| num_epochs=num_epochs, |
| weight_decay=weight_decay, |
| device=device, |
| seed=seed, |
| ) |
|
|
| print(f"\n[v4 IQL] Training IQL on {device}") |
| print(f" State dim: {state_dim}, Action dim: {N_ACTIONS}") |
| print(f" Hidden: {hidden_dim}, Layers: {num_layers}") |
| print(f" Expectile: {expectile}, Temperature: {temperature}") |
| print(f" LR: {learning_rate}, Batch: {batch_size}, Epochs: {num_epochs}") |
|
|
| trainer = IQLTrainer(state_dim=state_dim, action_dim=N_ACTIONS, config=config) |
|
|
| t_start = time.time() |
|
|
| def progress_fn(epoch, metrics): |
| if epoch % config.eval_freq == 0: |
| elapsed = time.time() - t_start |
| print(f" [{elapsed:.0f}s] Epoch {epoch}: " |
| f"Q={metrics['q_loss']:.4f} V={metrics['v_loss']:.4f} " |
| f"π={metrics['policy_loss']:.4f} " |
| f"Adv={metrics['advantage']:.4f}") |
|
|
| result = trainer.train( |
| states=train_dataset.states, |
| actions=train_dataset.actions, |
| rewards=train_dataset.rewards, |
| next_states=train_dataset.next_states, |
| dones=train_dataset.dones, |
| eval_states=test_dataset.states, |
| eval_rewards=test_dataset.rewards, |
| progress_fn=progress_fn, |
| ) |
|
|
| t_elapsed = time.time() - t_start |
| print(f"\n[v4 IQL] Training complete in {t_elapsed:.1f}s") |
| print(f"[v4 IQL] Final metrics: {result['final_metrics']}") |
|
|
| |
| trainer.save(out_path) |
|
|
| |
| np.savez( |
| out_path / "scaler.npz", |
| mean=train_dataset.mean, |
| std=train_dataset.std, |
| reward_mean=result["reward_mean"], |
| reward_std=result["reward_std"], |
| ) |
|
|
| |
| report = { |
| "algorithm": "IQL", |
| "config": config.__dict__, |
| "dataset": { |
| "path": str(data_path), |
| "history_length": history_length, |
| "episode_span_days": episode_span_days, |
| "episode_stride_days": episode_stride_days, |
| "risk_lambda": risk_lambda, |
| "soft_dd_penalty": soft_dd_penalty, |
| "behavioral_policy_mode": behavioral_policy_mode, |
| "behavioral_temperature": behavioral_temperature, |
| "min_trade_edge": min_trade_edge, |
| "behavioral_epsilon": behavioral_epsilon, |
| "taker_fee_rate": taker_fee_rate, |
| }, |
| "results": result, |
| "training_time_seconds": t_elapsed, |
| "device": device, |
| } |
| (out_path / "train_report.json").write_text(json.dumps(report, indent=2)) |
|
|
| |
| test_states = test_dataset.states |
| test_actions = test_dataset.actions |
| test_rewards = test_dataset.rewards |
|
|
| with torch.no_grad(): |
| states_t = torch.FloatTensor(test_states).to(trainer.device) |
| logits = trainer.policy_net(states_t) |
| predicted_actions = torch.argmax(logits, dim=-1).cpu().numpy() |
|
|
| agreement = float(np.mean(predicted_actions == test_actions)) |
| print(f"\n[v4 IQL] Policy-behavior agreement on test set: {agreement:.4f}") |
|
|
| report["test_agreement"] = agreement |
| (out_path / "train_report.json").write_text(json.dumps(report, indent=2)) |
|
|
| print(f"\n[v4 IQL] Artifacts saved to {out_path}") |
| print(f" - iql_model.pt") |
| print(f" - scaler.npz") |
| print(f" - train_report.json") |
| print(f" - dataset_info.json") |
|
|
| return report |
|
|
|
|
| def build_arg_parser() -> argparse.ArgumentParser: |
| parser = argparse.ArgumentParser(description="Train BTC v4 IQL Trading Agent.") |
| parser.add_argument("--data-path", default=None, help="Path to parquet dataset") |
| parser.add_argument("--outdir", default="artifacts_rl_btc_v4_iql") |
| parser.add_argument("--history-length", type=int, default=DEFAULT_HISTORY_LENGTH) |
| parser.add_argument("--episode-span-days", type=int, default=DEFAULT_EPISODE_SPAN_DAYS) |
| parser.add_argument("--episode-stride-days", type=int, default=DEFAULT_EPISODE_STRIDE_DAYS) |
| parser.add_argument("--hidden-dim", type=int, default=256) |
| parser.add_argument("--num-layers", type=int, default=2) |
| parser.add_argument("--dropout", type=float, default=0.0) |
| parser.add_argument("--expectile", type=float, default=0.7) |
| parser.add_argument("--temperature", type=float, default=3.0) |
| parser.add_argument("--gamma", type=float, default=0.99) |
| parser.add_argument("--tau", type=float, default=0.005) |
| parser.add_argument("--learning-rate", type=float, default=3e-4) |
| parser.add_argument("--batch-size", type=int, default=256) |
| parser.add_argument("--epochs", type=int, default=100) |
| parser.add_argument("--weight-decay", type=float, default=1e-4) |
| parser.add_argument("--risk-lambda", type=float, default=1.0) |
| parser.add_argument("--soft-dd-penalty", type=float, default=0.50) |
| parser.add_argument("--behavioral-policy-mode", choices=("conservative", "softmax"), default="conservative") |
| parser.add_argument("--behavioral-temperature", type=float, default=1.0) |
| parser.add_argument("--min-trade-edge", type=float, default=0.005) |
| parser.add_argument("--behavioral-epsilon", type=float, default=0.03) |
| parser.add_argument("--taker-fee-rate", type=float, default=TAKER_FEE_RATE) |
| parser.add_argument("--test-fraction", type=float, default=0.2) |
| parser.add_argument("--seed", type=int, default=42) |
| parser.add_argument("--device", default=None) |
| return parser |
|
|
|
|
| def main(argv: list[str] | None = None) -> None: |
| parser = build_arg_parser() |
| args = parser.parse_args(argv) |
| result = train( |
| data_path=args.data_path, |
| outdir=args.outdir, |
| history_length=args.history_length, |
| episode_span_days=args.episode_span_days, |
| episode_stride_days=args.episode_stride_days, |
| hidden_dim=args.hidden_dim, |
| num_layers=args.num_layers, |
| dropout=args.dropout, |
| expectile=args.expectile, |
| temperature=args.temperature, |
| gamma=args.gamma, |
| tau=args.tau, |
| learning_rate=args.learning_rate, |
| batch_size=args.batch_size, |
| num_epochs=args.epochs, |
| weight_decay=args.weight_decay, |
| risk_lambda=args.risk_lambda, |
| soft_dd_penalty=args.soft_dd_penalty, |
| behavioral_policy_mode=args.behavioral_policy_mode, |
| behavioral_temperature=args.behavioral_temperature, |
| min_trade_edge=args.min_trade_edge, |
| behavioral_epsilon=args.behavioral_epsilon, |
| taker_fee_rate=args.taker_fee_rate, |
| test_fraction=args.test_fraction, |
| seed=args.seed, |
| device=args.device, |
| ) |
| print(json.dumps(result, indent=2)) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|