rl_btc_v4_iql / train_cloud_v2.py
fbzu's picture
Upload folder using huggingface_hub
22d888b verified
#!/usr/bin/env python3
"""
Cloud training script for rl_btc_v4 IQL — v2 with fixes.
Downloads dataset from HF Hub, trains with improved IQL, and uploads model back.
Improvements over v1:
- Fixed Q-network architecture (discrete: outputs |A| values directly)
- Fixed V-update: uses Q-values at dataset actions (not one-hot at next state)
- LR scheduling (cosine decay)
- Early stopping with best model restoration
- TD3-style delayed policy updates
- Softmax behavioral policy for diverse dataset (not greedy)
- Clipped advantage weights
- Proper policy evaluation (action agreement metric)
"""
import json
import os
import sys
import time
from pathlib import Path
import numpy as np
import torch
# Report the runtime environment up front so the job log shows which PyTorch
# build and accelerator this run used.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory`; the previous
    # `total_mem` spelling raised AttributeError whenever a GPU was present.
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
# Download dataset from HF Hub
from huggingface_hub import hf_hub_download, snapshot_download

# Read the access token once; `os.environ.get` yields None when HF_TOKEN is unset.
hub_token = os.environ.get("HF_TOKEN")

print("\n[v4 IQL v2] Downloading dataset from HF Hub...")
dataset_file = {
    "repo_id": "fbzu/btc_updown_5m_augmented_v1",
    "filename": "btc_updown_5m_augmented_v1.parquet",
    "repo_type": "dataset",
}
data_path = hf_hub_download(token=hub_token, **dataset_file)
print(f"[v4 IQL v2] Dataset downloaded to {data_path}")

# Only the `rl_btc_v4/` files of the model repo are requested.
print("[v4 IQL v2] Downloading code from HF Hub...")
code_snapshot = {
    "repo_id": "fbzu/rl_btc_v4_iql",
    "repo_type": "model",
    "allow_patterns": ["rl_btc_v4/*"],
}
code_dir = snapshot_download(token=hub_token, **code_snapshot)

# Put the snapshot root first on sys.path so the downloaded `rl_btc_v4`
# package is found by the imports below.
sys.path.insert(0, code_dir)
from rl_btc_v4.dataset import build_offline_rl_dataset
from rl_btc_v4.iql_trainer import IQLTrainer, IQLConfig
from rl_btc_v4.constants import N_ACTIONS
# Trackio monitoring: start the run here; metrics are reported later through
# `trackio.log` and the run is ended with `trackio.finish`.
import trackio

run = trackio.init(
    project="rl_btc_v4_iql",
    name="iql_training_v4_v2",
    space_id="fbzu/trackio-rl-btc-v4",
)
# ── Training config ──────────────────────────────────────────────────────────
# Train on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Dataset with softmax behavioral policy (temperature=1.0 for diverse actions)
dataset_options = dict(
    history_length=30,
    episode_span_days=30,
    episode_stride_days=15,
    risk_lambda=1.0,
    soft_dd_penalty=0.50,
    behavioral_temperature=1.0,  # softmax temperature for behavioral policy
    test_fraction=0.2,
    seed=42,
)
train_dataset, test_dataset = build_offline_rl_dataset(
    data_path=data_path,
    **dataset_options,
)

# Summarise both splits in the log before training starts.
print(f"[v4 IQL v2] Train transitions: {train_dataset.n_transitions}")
print(f"[v4 IQL v2] Test transitions: {test_dataset.n_transitions}")
state_dim = train_dataset.states.shape[1]
print(f"[v4 IQL v2] State dim: {state_dim}")
action_ids, action_counts = np.unique(train_dataset.actions, return_counts=True)
print(f"[v4 IQL v2] Train action dist: {dict(zip(action_ids, action_counts))}")
positive_fraction = (train_dataset.rewards > 0).mean()
print(f"[v4 IQL v2] Positive reward fraction: {positive_fraction:.3f}")

config = IQLConfig(
    # Network
    hidden_dim=256,
    num_layers=2,
    dropout=0.1,
    # IQL objective
    expectile=0.7,
    temperature=3.0,
    gamma=0.99,
    tau=0.005,
    # Optimisation
    learning_rate=3e-4,
    batch_size=512,
    num_epochs=100,
    weight_decay=1e-4,
    policy_update_freq=2,  # TD3-style delayed updates
    early_stopping_patience=20,  # stop if no improvement for 20 evals
    eval_freq=5,
    # Runtime
    device=device,
    seed=42,
)
trainer = IQLTrainer(state_dim=state_dim, action_dim=N_ACTIONS, config=config)

# Reference point for the elapsed-time figures printed during and after training.
t_start = time.time()
def progress_fn(epoch, metrics):
    """Per-epoch training callback: log losses to Trackio and echo them to stdout.

    Every call is forwarded to ``trackio.log``. The console line is printed
    only when ``epoch`` is a multiple of ``config.eval_freq`` or equals
    ``config.num_epochs - 1``.

    Args:
        epoch: Epoch index supplied by the caller.
        metrics: Mapping that provides the keys ``"q_loss"``, ``"v_loss"``,
            ``"policy_loss"`` and ``"advantage"``.
    """
    trackio.log({
        "epoch": epoch,
        "q_loss": metrics["q_loss"],
        "v_loss": metrics["v_loss"],
        "policy_loss": metrics["policy_loss"],
        "advantage": metrics["advantage"],
    })
    if epoch % config.eval_freq == 0 or epoch == config.num_epochs - 1:
        # Elapsed time is measured from the module-level `t_start`.
        elapsed = time.time() - t_start
        # The policy-loss label was mojibake ("Ο€"); it is the Greek letter pi.
        print(
            f" [{elapsed:.0f}s] Epoch {epoch}: "
            f"Q={metrics['q_loss']:.4f} V={metrics['v_loss']:.4f} "
            f"π={metrics['policy_loss']:.4f} Adv={metrics['advantage']:.4f}"
        )
# Run training: the train split supplies the transitions, the test split is
# passed as the evaluation data, and `progress_fn` is the progress callback.
train_inputs = dict(
    states=train_dataset.states,
    actions=train_dataset.actions,
    rewards=train_dataset.rewards,
    next_states=train_dataset.next_states,
    dones=train_dataset.dones,
)
eval_inputs = dict(
    eval_states=test_dataset.states,
    eval_rewards=test_dataset.rewards,
    eval_dones=test_dataset.dones,
)
result = trainer.train(**train_inputs, **eval_inputs, progress_fn=progress_fn)

# Wall-clock duration since `t_start`, followed by the headline results.
t_elapsed = time.time() - t_start
print(f"\n[v4 IQL v2] Training complete in {t_elapsed:.1f}s")
print(f"[v4 IQL v2] Final metrics: {result['final_metrics']}")
print(f"[v4 IQL v2] Best eval reward: {result['best_eval_reward']}")

# The final-eval line is printed only when the entry is present and truthy.
final_eval = result.get('final_eval')
if final_eval:
    print(f"[v4 IQL v2] Final eval: {final_eval}")
# ── Save and upload to HF Hub ─────────────────────────────────────────────────
import tempfile


def _json_default(value):
    """Convert values the ``json`` encoder cannot handle natively.

    Used as the ``default=`` hook when writing the training report, so that a
    NumPy scalar/array or tensor inside ``result`` or ``config`` cannot abort
    the script after training has finished but before anything is uploaded.

    Args:
        value: The object ``json.dumps`` could not serialize.

    Returns:
        A plain Python equivalent (number, list or string).

    Raises:
        TypeError: For any other type, so unexpected objects are still
            reported instead of being silently stringified.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, torch.Tensor):
        return value.detach().cpu().tolist()
    if isinstance(value, Path):
        return str(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Everything is staged in a temporary directory and uploaded before the
# directory is removed on exit from the `with` block.
with tempfile.TemporaryDirectory() as tmpdir:
    tmp_path = Path(tmpdir)

    # Save model
    trainer.save(tmp_path)

    # Save normalization stats
    np.savez(
        tmp_path / "scaler.npz",
        mean=train_dataset.mean,
        std=train_dataset.std,
        reward_mean=result["reward_mean"],
        reward_std=result["reward_std"],
    )

    # Save report
    report = {
        "algorithm": "IQL",
        "version": "v2",
        "config": vars(config),
        "dataset": {
            "path": "fbzu/btc_updown_5m_augmented_v1",
            "history_length": 30,
            "episode_span_days": 30,
            "episode_stride_days": 15,
            "risk_lambda": 1.0,
            "soft_dd_penalty": 0.50,
            "behavioral_temperature": 1.0,
        },
        "results": result,
        "training_time_seconds": t_elapsed,
        "device": device,
    }
    (tmp_path / "train_report.json").write_text(
        json.dumps(report, indent=2, default=_json_default),
        encoding="utf-8",
    )

    print("\n[v4 IQL v2] Uploading trained model to HF Hub...")
    from huggingface_hub import HfApi

    api = HfApi(token=os.environ.get("HF_TOKEN"))
    # Upload the staged directory in one call rather than file by file: the
    # per-file loop made a separate commit per artifact and passed any
    # sub-directory written by `trainer.save` to `upload_file`.
    api.upload_folder(
        folder_path=str(tmp_path),
        repo_id="fbzu/rl_btc_v4_iql",
        repo_type="model",
    )
    for staged in sorted(p for p in tmp_path.rglob("*") if p.is_file()):
        print(f" Uploaded {staged.relative_to(tmp_path).as_posix()}")

print("\n[v4 IQL v2] Model uploaded to https://huggingface.co/fbzu/rl_btc_v4_iql")
# Final trackio metrics: log one summary record, then call `trackio.finish()`.
final_metrics = result["final_metrics"]
# NOTE(review): the per-epoch callback reads metrics["advantage"], while this
# summary reads "mean_advantage" — confirm the trainer's final metrics use
# this key name.
summary = {
    "training_time_seconds": t_elapsed,
    "final_q_loss": final_metrics["q_loss"],
    "final_v_loss": final_metrics["v_loss"],
    "final_policy_loss": final_metrics["policy_loss"],
    "final_mean_advantage": final_metrics["mean_advantage"],
    "best_eval_reward": result["best_eval_reward"],
}
trackio.log(summary)
trackio.finish()