#!/usr/bin/env python3
"""
Cloud training script for rl_btc_v4 IQL — v2 with fixes.

Downloads dataset from HF Hub, trains with improved IQL, and uploads model back.

Improvements over v1:
- Fixed Q-network architecture (discrete: outputs |A| values directly)
- Fixed V-update: uses Q-values at dataset actions (not one-hot at next state)
- LR scheduling (cosine decay)
- Early stopping with best model restoration
- TD3-style delayed policy updates
- Softmax behavioral policy for diverse dataset (not greedy)
- Clipped advantage weights
- Proper policy evaluation (action agreement metric)

Environment:
    HF_TOKEN: Hugging Face access token, read once and passed to every Hub call.
"""

import json
import os
import sys
import tempfile
import time
from pathlib import Path

import numpy as np
import torch
import trackio
from huggingface_hub import HfApi, hf_hub_download, snapshot_download

# ── Constants ────────────────────────────────────────────────────────────────
# Read the token a single time so every Hub call uses the same credential.
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = "fbzu/btc_updown_5m_augmented_v1"
DATASET_FILE = "btc_updown_5m_augmented_v1.parquet"
MODEL_REPO = "fbzu/rl_btc_v4_iql"

# Single source of truth for the dataset-construction settings: the same dict
# is passed to build_offline_rl_dataset() and written into the train report,
# so the report cannot drift from what was actually used.
DATASET_PARAMS = {
    "history_length": 30,
    "episode_span_days": 30,
    "episode_stride_days": 15,
    "risk_lambda": 1.0,
    "soft_dd_penalty": 0.50,
    "behavioral_temperature": 1.0,  # softmax temperature for behavioral policy
}


def _json_default(obj):
    """Convert objects the stdlib JSON encoder rejects into plain Python values.

    Used as ``default=`` when serializing the train report, which embeds the
    trainer's result dict verbatim.

    Args:
        obj: The value ``json.dumps`` could not encode on its own.

    Returns:
        A JSON-encodable equivalent (Python scalar, list, or string).

    Raises:
        TypeError: If ``obj`` is not a NumPy scalar/array, a torch tensor, or a
            ``Path`` — mirroring the encoder's own error for unknown types.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, torch.Tensor):
        # Move off the accelerator before converting to nested lists.
        return obj.detach().cpu().tolist()
    if isinstance(obj, Path):
        return str(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    device_props = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory`; `total_mem` does not exist
    # and raised AttributeError on every CUDA machine.
    print(f"CUDA memory: {device_props.total_memory / 1e9:.1f} GB")

# Download dataset from HF Hub
print("\n[v4 IQL v2] Downloading dataset from HF Hub...")
data_path = hf_hub_download(
    repo_id=DATASET_REPO,
    filename=DATASET_FILE,
    repo_type="dataset",
    token=HF_TOKEN,
)
print(f"[v4 IQL v2] Dataset downloaded to {data_path}")

print("[v4 IQL v2] Downloading code from HF Hub...")
code_dir = snapshot_download(
    repo_id=MODEL_REPO,
    repo_type="model",
    token=HF_TOKEN,
    allow_patterns=["rl_btc_v4/*"],
)
# The project package only becomes importable once the snapshot directory is
# on sys.path, so these imports cannot be hoisted to the top of the file.
sys.path.insert(0, code_dir)

from rl_btc_v4.dataset import build_offline_rl_dataset
from rl_btc_v4.iql_trainer import IQLTrainer, IQLConfig
from rl_btc_v4.constants import N_ACTIONS

# Trackio monitoring
run = trackio.init(
    project="rl_btc_v4_iql",
    name="iql_training_v4_v2",
    space_id="fbzu/trackio-rl-btc-v4",
)

# ── Training config ──────────────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"

# Dataset with softmax behavioral policy (temperature=1.0 for diverse actions)
train_dataset, test_dataset = build_offline_rl_dataset(
    data_path=data_path,
    **DATASET_PARAMS,
    test_fraction=0.2,
    seed=42,
)

state_dim = train_dataset.states.shape[1]
# .tolist() yields native Python numbers so the printed dict stays readable
# regardless of the NumPy version's scalar repr.
action_values, action_counts = np.unique(train_dataset.actions, return_counts=True)
action_dist = dict(zip(action_values.tolist(), action_counts.tolist()))

print(f"[v4 IQL v2] Train transitions: {train_dataset.n_transitions}")
print(f"[v4 IQL v2] Test transitions: {test_dataset.n_transitions}")
print(f"[v4 IQL v2] State dim: {state_dim}")
print(f"[v4 IQL v2] Train action dist: {action_dist}")
print(f"[v4 IQL v2] Positive reward fraction: {(train_dataset.rewards > 0).mean():.3f}")

config = IQLConfig(
    hidden_dim=256,
    num_layers=2,
    dropout=0.1,
    expectile=0.7,
    temperature=3.0,
    gamma=0.99,
    tau=0.005,
    learning_rate=3e-4,
    batch_size=512,
    num_epochs=100,
    weight_decay=1e-4,
    policy_update_freq=2,  # TD3-style delayed updates
    early_stopping_patience=20,  # stop if no improvement for 20 evals
    eval_freq=5,
    device=device,
    seed=42,
)

trainer = IQLTrainer(state_dim=state_dim, action_dim=N_ACTIONS, config=config)

t_start = time.time()


def progress_fn(epoch, metrics):
    """Progress callback handed to ``trainer.train``.

    Sends the loss/advantage values to trackio on every call, and prints a
    one-line summary when ``epoch`` is a multiple of ``config.eval_freq`` or is
    the last configured epoch.

    Args:
        epoch: Epoch index supplied by the trainer.
        metrics: Mapping that must provide ``q_loss``, ``v_loss``,
            ``policy_loss`` and ``advantage``.
    """
    trackio.log({
        "epoch": epoch,
        "q_loss": metrics["q_loss"],
        "v_loss": metrics["v_loss"],
        "policy_loss": metrics["policy_loss"],
        "advantage": metrics["advantage"],
    })
    if epoch % config.eval_freq == 0 or epoch == config.num_epochs - 1:
        elapsed = time.time() - t_start
        print(f" [{elapsed:.0f}s] Epoch {epoch}: "
              f"Q={metrics['q_loss']:.4f} V={metrics['v_loss']:.4f} "
              f"π={metrics['policy_loss']:.4f} Adv={metrics['advantage']:.4f}")


result = trainer.train(
    states=train_dataset.states,
    actions=train_dataset.actions,
    rewards=train_dataset.rewards,
    next_states=train_dataset.next_states,
    dones=train_dataset.dones,
    eval_states=test_dataset.states,
    eval_rewards=test_dataset.rewards,
    eval_dones=test_dataset.dones,
    progress_fn=progress_fn,
)

t_elapsed = time.time() - t_start
print(f"\n[v4 IQL v2] Training complete in {t_elapsed:.1f}s")
print(f"[v4 IQL v2] Final metrics: {result['final_metrics']}")
print(f"[v4 IQL v2] Best eval reward: {result['best_eval_reward']}")
if result.get('final_eval'):
    print(f"[v4 IQL v2] Final eval: {result['final_eval']}")

# ── Save and upload to HF Hub ─────────────────────────────────────────────────
# Everything is staged in a temporary directory; the upload must happen inside
# the `with` block because the directory is removed on exit.
with tempfile.TemporaryDirectory() as tmpdir:
    tmp_path = Path(tmpdir)

    # Save model
    trainer.save(tmp_path)

    # Save normalization stats
    np.savez(
        tmp_path / "scaler.npz",
        mean=train_dataset.mean,
        std=train_dataset.std,
        reward_mean=result["reward_mean"],
        reward_std=result["reward_std"],
    )

    # Save report
    report = {
        "algorithm": "IQL",
        "version": "v2",
        "config": dict(vars(config)),
        "dataset": {"path": DATASET_REPO, **DATASET_PARAMS},
        "results": result,
        "training_time_seconds": t_elapsed,
        "device": device,
    }
    # `result` comes straight from the trainer and may hold NumPy/torch values
    # that the stdlib encoder rejects; _json_default converts those so a
    # finished training run is not lost to a serialization error.
    (tmp_path / "train_report.json").write_text(
        json.dumps(report, indent=2, default=_json_default)
    )

    print("\n[v4 IQL v2] Uploading trained model to HF Hub...")
    api = HfApi(token=HF_TOKEN)
    for f in tmp_path.iterdir():
        api.upload_file(
            path_or_fileobj=str(f),
            path_in_repo=f.name,
            repo_id=MODEL_REPO,
            repo_type="model",
        )
        print(f" Uploaded {f.name}")

print(f"\n[v4 IQL v2] Model uploaded to https://huggingface.co/{MODEL_REPO}")

# Final trackio metrics
# NOTE(review): the per-epoch metrics use the key "advantage" while this reads
# "mean_advantage" from final_metrics — confirm the trainer really returns
# that key, otherwise this raises KeyError after the upload has completed.
final_metrics = result["final_metrics"]
trackio.log({
    "training_time_seconds": t_elapsed,
    "final_q_loss": final_metrics["q_loss"],
    "final_v_loss": final_metrics["v_loss"],
    "final_policy_loss": final_metrics["policy_loss"],
    "final_mean_advantage": final_metrics["mean_advantage"],
    "best_eval_reward": result["best_eval_reward"],
})
trackio.finish()