diff --git "a/version 20 pytorch.ipynb" "b/version 20 pytorch.ipynb" new file mode 100644--- /dev/null +++ "b/version 20 pytorch.ipynb" @@ -0,0 +1,4029 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:00:50.617647Z", + "iopub.status.busy": "2025-12-01T09:00:50.617372Z", + "iopub.status.idle": "2025-12-01T09:01:15.410846Z", + "shell.execute_reply": "2025-12-01T09:01:15.410130Z", + "shell.execute_reply.started": "2025-12-01T09:00:50.617625Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.0: SETUP - INSTALL LIBRARIES + CONFIGURE PYTORCH\n", + "# ============================================================================\n", + "\n", + "# Install technical analysis library\n", + "!pip install -q ta\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "print(\"=\"*70)\n", + "print(\" ENVIRONMENT SETUP\")\n", + "print(\"=\"*70)\n", + "\n", + "# Set default dtype to float32\n", + "torch.set_default_dtype(torch.float32)\n", + "\n", + "print(f\"✅ PyTorch: {torch.__version__}\")\n", + "print(f\"✅ Default dtype: {torch.get_default_dtype()}\")\n", + "\n", + "# GPU detection\n", + "if torch.cuda.is_available():\n", + " num_gpus = torch.cuda.device_count()\n", + " print(f\"✅ GPUs detected: {num_gpus}\")\n", + " for i in range(num_gpus):\n", + " print(f\" GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + " device = torch.device(\"cuda:0\")\n", + "else:\n", + " print(\"✅ No GPU detected, using CPU\")\n", + " device = torch.device(\"cpu\")\n", + "\n", + "print(f\"\\n✅ Using device: {device}\")\n", + "print(\"\\n✅ Setup complete - ready to build SAC agent\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:15.412845Z", + "iopub.status.busy": "2025-12-01T09:01:15.412350Z", + "iopub.status.idle": "2025-12-01T09:01:15.641978Z", + "shell.execute_reply": "2025-12-01T09:01:15.640745Z", + "shell.execute_reply.started": "2025-12-01T09:01:15.412790Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.01: GPU SETUP (RUN THIS FIRST!) 
- 5-TIER GPU OPTIMIZATION\n", + "# ============================================================================\n", + "\n", + "import torch\n", + "import torch.multiprocessing as mp\n", + "import os\n", + "\n", + "print(\"=\"*70)\n", + "print(\" GPU INITIALIZATION FOR PARALLEL TRAINING - 5-TIER OPTIMIZED\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# TIER 1: Setup multiprocessing with CUDA-compatible spawn method\n", + "# ============================================================================\n", + "try:\n", + " mp.set_start_method('spawn', force=True)\n", + " print(\"✅ Multiprocessing: 'spawn' method set for CUDA compatibility\")\n", + "except RuntimeError:\n", + " print(\"⚠️ Multiprocessing start method already set\")\n", + "\n", + "# Configure GPU\n", + "if torch.cuda.is_available():\n", + " num_gpus = torch.cuda.device_count()\n", + " print(f\"\\n✅ Configured {num_gpus} GPU(s):\")\n", + " for i in range(num_gpus):\n", + " print(f\" GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + " mem_info = torch.cuda.get_device_properties(i)\n", + " print(f\" Total Memory: {mem_info.total_memory / 1e9:.2f} GB\")\n", + " \n", + " # Set default device\n", + " device = torch.device(\"cuda:0\")\n", + " torch.cuda.set_device(device)\n", + " \n", + " # Enable cuDNN benchmarking for faster training\n", + " torch.backends.cudnn.benchmark = True\n", + " torch.backends.cudnn.enabled = True\n", + " \n", + " # ============================================================================\n", + " # TIER 2: Enable TF32 for faster matrix operations (40-70% speedup on Ampere+)\n", + " # ============================================================================\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + " torch.backends.cudnn.allow_tf32 = True\n", + " print(f\"\\n✅ TF32 Matmul: Enabled (40-70% speedup on Ampere GPUs)\")\n", + " \n", + " # Enable flash attention if available (PyTorch 2.0+)\n", + " if hasattr(torch.backends.cuda, 'enable_flash_sdp'):\n", + " torch.backends.cuda.enable_flash_sdp(True)\n", + " print(f\"✅ Flash Attention: Enabled\")\n", + " \n", + " print(f\"\\n✅ Default device set to: {device}\")\n", + " print(f\"✅ cuDNN benchmark: Enabled\")\n", + "else:\n", + " device = torch.device(\"cpu\")\n", + " print(\"❌ No GPUs found! 
Using CPU\")\n", + "\n", + "# Check PyTorch version for torch.compile support\n", + "pytorch_version = torch.__version__\n", + "print(f\"\\n📦 PyTorch Version: {pytorch_version}\")\n", + "if int(pytorch_version.split('.')[0]) >= 2:\n", + " print(\"✅ torch.compile available (PyTorch 2.0+)\")\n", + " TORCH_COMPILE_AVAILABLE = True\n", + "else:\n", + " print(\"⚠️ torch.compile requires PyTorch 2.0+ (will use fallback)\")\n", + " TORCH_COMPILE_AVAILABLE = False\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" 🚀 5-TIER GPU OPTIMIZATION ENABLED:\")\n", + "print(\"=\"*70)\n", + "print(\" TIER 1: Multiprocessing with 'spawn' (replaces threading)\")\n", + "print(\" TIER 2: torch.compile + TF32 (40-70% speedup)\")\n", + "print(\" TIER 3: GPU-accelerated environments\")\n", + "print(\" TIER 4: Vectorized environments (batched rollouts)\")\n", + "print(\" TIER 5: Async replay buffer pre-sampling\")\n", + "print(\"=\"*70)\n", + "print(\"\\n⚠️ IMPORTANT: Run this cell FIRST, then run other cells\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:15.643906Z", + "iopub.status.busy": "2025-12-01T09:01:15.643388Z", + "iopub.status.idle": "2025-12-01T09:01:23.936289Z", + "shell.execute_reply": "2025-12-01T09:01:23.935499Z", + "shell.execute_reply.started": "2025-12-01T09:01:15.643874Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0: ADVANCED SETUP WITH REAL FEAR & GREED INDEX\n", + "# ============================================================================\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import gym\n", + "from gym import spaces\n", + "from sklearn.preprocessing import StandardScaler\n", + "from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator, WilliamsRIndicator\n", + "from ta.trend import MACD, EMAIndicator, SMAIndicator, ADXIndicator, CCIIndicator\n", + "from ta.volatility import BollingerBands, AverageTrueRange, KeltnerChannel\n", + "from ta.volume import OnBalanceVolumeIndicator, ChaikinMoneyFlowIndicator\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "print(\"=\"*70)\n", + "print(\" ADVANCED SAC SETUP - 56+ FEATURES + REAL FEAR & GREED\")\n", + "print(\"=\"*70)\n", + "print(\"Loading Bitcoin 15-min + Fear & Greed Index data...\")\n", + "\n", + "# ============================================================================\n", + "# 1. 
LOAD BITCOIN 15-MIN DATA\n", + "# ============================================================================\n", + "data_path = '/kaggle/input/bitcoin-historical-datasets-2018-2024/'\n", + "btc_data = pd.read_csv(data_path + 'btc_15m_data_2018_to_2025.csv')\n", + "\n", + "column_mapping = {'Open time': 'timestamp', 'Open': 'open', 'High': 'high', \n", + " 'Low': 'low', 'Close': 'close', 'Volume': 'volume'}\n", + "btc_data = btc_data.rename(columns=column_mapping)\n", + "btc_data['timestamp'] = pd.to_datetime(btc_data['timestamp'])\n", + "btc_data.set_index('timestamp', inplace=True)\n", + "btc_data = btc_data[['open', 'high', 'low', 'close', 'volume']]\n", + "\n", + "for col in btc_data.columns:\n", + " btc_data[col] = pd.to_numeric(btc_data[col], errors='coerce')\n", + "\n", + "btc_data = btc_data[btc_data.index >= '2021-01-01']\n", + "btc_data = btc_data[~btc_data.index.duplicated(keep='first')]\n", + "btc_data = btc_data.replace(0, np.nan).dropna().sort_index()\n", + "\n", + "print(f\"✅ BTC Data: {len(btc_data):,} 15-min candles\")\n", + "print(f\" Date range: {btc_data.index[0]} to {btc_data.index[-1]}\")\n", + "\n", + "# ============================================================================\n", + "# 2. LOAD FEAR & GREED INDEX - TRY BOTH DATASETS PROPERLY\n", + "# ============================================================================\n", + "fgi_loaded = False\n", + "\n", + "# TRY DATASET 1: metalgrey (4H OHLC with daily FGI)\n", + "try:\n", + " print(\"\\n🔍 Trying Fear & Greed Dataset 1 (metalgrey)...\")\n", + " fgi_path1 = '/kaggle/input/btc-usdt-4h-ohlc-fgi-daily-2020/'\n", + " \n", + " # List files to see what's available\n", + " import os\n", + " files_in_path = os.listdir(fgi_path1)\n", + " print(f\" Files found: {files_in_path}\")\n", + " \n", + " # Try common filenames\n", + " for filename in ['btc_usdt_4h_ohlc_fgi_daily.csv', 'data.csv', 'bitcoin_fear_greed.csv']:\n", + " try:\n", + " fgi_data = pd.read_csv(fgi_path1 + filename)\n", + " print(f\" ✅ Loaded: {filename}\")\n", + " print(f\" Columns: {list(fgi_data.columns)}\")\n", + " \n", + " # Handle different column names\n", + " if 'timestamp' in fgi_data.columns:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data['timestamp'])\n", + " elif 'date' in fgi_data.columns:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data['date'])\n", + " elif 'time' in fgi_data.columns:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data['time'])\n", + " else:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data.iloc[:, 0])\n", + " \n", + " fgi_data.set_index('timestamp', inplace=True)\n", + " \n", + " # Find FGI column\n", + " if 'fear_greed_index' in fgi_data.columns:\n", + " fgi_data = fgi_data[['fear_greed_index']].rename(columns={'fear_greed_index': 'fgi'})\n", + " elif 'fgi' in fgi_data.columns:\n", + " fgi_data = fgi_data[['fgi']]\n", + " elif 'value' in fgi_data.columns:\n", + " fgi_data = fgi_data[['value']].rename(columns={'value': 'fgi'})\n", + " else:\n", + " # Use first numeric column\n", + " fgi_data = fgi_data.iloc[:, 0:1].rename(columns={fgi_data.columns[0]: 'fgi'})\n", + " \n", + " print(f\" ✅ Fear & Greed loaded: {len(fgi_data):,} values\")\n", + " print(f\" Date range: {fgi_data.index[0]} to {fgi_data.index[-1]}\")\n", + " fgi_loaded = True\n", + " break\n", + " except Exception as e:\n", + " continue\n", + " \n", + "except Exception as e:\n", + " print(f\" ❌ Dataset 1 failed: {e}\")\n", + "\n", + "# TRY DATASET 2: wlwwwlw (Bitcoin Pulse)\n", + "if not fgi_loaded:\n", + " try:\n", + " print(\"\\n🔍 
Trying Fear & Greed Dataset 2 (wlwwwlw)...\")\n", + " fgi_path2 = '/kaggle/input/bitcoin-pulse-market-trends-and-fear-dataset/'\n", + " \n", + " files_in_path = os.listdir(fgi_path2)\n", + " print(f\" Files found: {files_in_path}\")\n", + " \n", + " for filename in ['bitcoin_fear_greed.csv', 'fear_greed.csv', 'data.csv']:\n", + " try:\n", + " fgi_data = pd.read_csv(fgi_path2 + filename)\n", + " print(f\" ✅ Loaded: {filename}\")\n", + " print(f\" Columns: {list(fgi_data.columns)}\")\n", + " \n", + " # Handle timestamp\n", + " if 'timestamp' in fgi_data.columns:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data['timestamp'])\n", + " elif 'date' in fgi_data.columns:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data['date'])\n", + " else:\n", + " fgi_data['timestamp'] = pd.to_datetime(fgi_data.iloc[:, 0])\n", + " \n", + " fgi_data.set_index('timestamp', inplace=True)\n", + " \n", + " # Find FGI column\n", + " if 'value' in fgi_data.columns:\n", + " fgi_data = fgi_data[['value']].rename(columns={'value': 'fgi'})\n", + " elif 'fear_greed_index' in fgi_data.columns:\n", + " fgi_data = fgi_data[['fear_greed_index']].rename(columns={'fear_greed_index': 'fgi'})\n", + " elif 'fgi' in fgi_data.columns:\n", + " fgi_data = fgi_data[['fgi']]\n", + " else:\n", + " fgi_data = fgi_data.iloc[:, 1:2].rename(columns={fgi_data.columns[1]: 'fgi'})\n", + " \n", + " print(f\" ✅ Fear & Greed loaded: {len(fgi_data):,} values\")\n", + " print(f\" Date range: {fgi_data.index[0]} to {fgi_data.index[-1]}\")\n", + " fgi_loaded = True\n", + " break\n", + " except Exception as e:\n", + " continue\n", + " \n", + " except Exception as e:\n", + " print(f\" ❌ Dataset 2 failed: {e}\")\n", + "\n", + "# FALLBACK: Create dummy values if both failed\n", + "if not fgi_loaded:\n", + " print(\"\\n⚠️ Both datasets failed, creating neutral dummy values\")\n", + " fgi_data = pd.DataFrame(index=btc_data.index)\n", + " fgi_data['fgi'] = 50 # Neutral\n", + "\n", + "# ============================================================================\n", + "# 3. 
MERGE FEAR & GREED WITH BTC DATA (PROPER TIMESTAMP MATCHING)\n", + "# ============================================================================\n", + "print(\"\\n🔗 Merging Fear & Greed with Bitcoin data...\")\n", + "\n", + "# Merge with forward fill (FGI is daily, BTC is 15-min)\n", + "btc_data = btc_data.join(fgi_data, how='left')\n", + "\n", + "# Forward fill missing values (daily FGI → 15-min intervals)\n", + "btc_data['fgi'] = btc_data['fgi'].fillna(method='ffill')\n", + "\n", + "# Backward fill for any remaining NaN at start\n", + "btc_data['fgi'] = btc_data['fgi'].fillna(method='bfill')\n", + "\n", + "# If still NaN, use neutral value\n", + "btc_data['fgi'] = btc_data['fgi'].fillna(50)\n", + "\n", + "print(f\"✅ Merged: {len(btc_data):,} candles with Fear & Greed\")\n", + "\n", + "# Verify FGI has variation (not all 50)\n", + "fgi_unique = btc_data['fgi'].nunique()\n", + "fgi_mean = btc_data['fgi'].mean()\n", + "fgi_std = btc_data['fgi'].std()\n", + "\n", + "print(f\"\\n📊 Fear & Greed Index stats:\")\n", + "print(f\" Unique values: {fgi_unique}\")\n", + "print(f\" Mean: {fgi_mean:.1f}\")\n", + "print(f\" Std: {fgi_std:.1f}\")\n", + "print(f\" Min/Max: {btc_data['fgi'].min():.0f} / {btc_data['fgi'].max():.0f}\")\n", + "\n", + "if fgi_unique == 1:\n", + " print(\" ⚠️ WARNING: FGI is constant (dummy values)\")\n", + " print(\" Will still train, but missing 5-10% potential improvement\")\n", + "else:\n", + " print(\" ✅ FGI has variation - good!\")\n", + "\n", + "# ============================================================================\n", + "# 4. CALCULATE 54+ TECHNICAL INDICATORS (SAME AS BEFORE)\n", + "# ============================================================================\n", + "print(\"\\n🔧 Calculating 54+ technical indicators...\")\n", + "data = btc_data.copy()\n", + "\n", + "# Momentum (10 features)\n", + "print(\" 📊 Momentum...\")\n", + "data['rsi_14'] = RSIIndicator(close=data['close'], window=14).rsi() / 100\n", + "data['rsi_7'] = RSIIndicator(close=data['close'], window=7).rsi() / 100\n", + "data['rsi_21'] = RSIIndicator(close=data['close'], window=21).rsi() / 100\n", + "\n", + "stoch = StochasticOscillator(high=data['high'], low=data['low'], close=data['close'], window=14)\n", + "data['stoch_k'] = stoch.stoch() / 100\n", + "data['stoch_d'] = stoch.stoch_signal() / 100\n", + "\n", + "roc = ROCIndicator(close=data['close'], window=12)\n", + "data['roc_12'] = np.tanh(roc.roc() / 100)\n", + "\n", + "williams = WilliamsRIndicator(high=data['high'], low=data['low'], close=data['close'], lbp=14)\n", + "data['williams_r'] = (williams.williams_r() + 100) / 100\n", + "\n", + "macd = MACD(close=data['close'], window_slow=26, window_fast=12, window_sign=9)\n", + "data['macd'] = np.tanh(macd.macd() / data['close'] * 100)\n", + "data['macd_signal'] = np.tanh(macd.macd_signal() / data['close'] * 100)\n", + "data['macd_diff'] = np.tanh(macd.macd_diff() / data['close'] * 100)\n", + "\n", + "# Trend (12 features)\n", + "print(\" 📈 Trend...\")\n", + "data['sma_20'] = SMAIndicator(close=data['close'], window=20).sma_indicator()\n", + "data['sma_50'] = SMAIndicator(close=data['close'], window=50).sma_indicator()\n", + "data['sma_200'] = SMAIndicator(close=data['close'], window=200).sma_indicator()\n", + "data['ema_12'] = EMAIndicator(close=data['close'], window=12).ema_indicator()\n", + "data['ema_26'] = EMAIndicator(close=data['close'], window=26).ema_indicator()\n", + "\n", + "data['price_vs_sma20'] = (data['close'] - data['sma_20']) / data['sma_20']\n", + "data['price_vs_sma50'] = 
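(data['close'] - data['sma_50']) / data['sma_50']\n", + "\n", + "# (Added note, not in the original): the price_vs_sma* features are relative\n", + "# distances, e.g. +0.02 means price sits 2% above the moving average, which\n", + "# keeps the feature scale-free across BTC's very different price regimes.\n", + "data['price_vs_sma50'] =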
(data['close'] - data['sma_50']) / data['sma_50']\n", + "data['price_vs_sma200'] = (data['close'] - data['sma_200']) / data['sma_200']\n", + "\n", + "adx = ADXIndicator(high=data['high'], low=data['low'], close=data['close'], window=14)\n", + "data['adx'] = adx.adx() / 100\n", + "data['adx_pos'] = adx.adx_pos() / 100\n", + "data['adx_neg'] = adx.adx_neg() / 100\n", + "\n", + "cci = CCIIndicator(high=data['high'], low=data['low'], close=data['close'], window=20)\n", + "data['cci'] = np.tanh(cci.cci() / 100)\n", + "\n", + "# Volatility (7 features)\n", + "print(\" 💥 Volatility...\")\n", + "bb = BollingerBands(close=data['close'], window=20, window_dev=2)\n", + "data['bb_high'] = bb.bollinger_hband()\n", + "data['bb_low'] = bb.bollinger_lband()\n", + "data['bb_mid'] = bb.bollinger_mavg()\n", + "data['bb_width'] = (data['bb_high'] - data['bb_low']) / data['bb_mid']\n", + "data['bb_position'] = (data['close'] - data['bb_low']) / (data['bb_high'] - data['bb_low'])\n", + "\n", + "atr = AverageTrueRange(high=data['high'], low=data['low'], close=data['close'], window=14)\n", + "data['atr'] = atr.average_true_range()\n", + "data['atr_percent'] = data['atr'] / data['close']\n", + "\n", + "# Volume (5 features)\n", + "print(\" 📦 Volume...\")\n", + "data['volume_ma_20'] = data['volume'].rolling(20).mean()\n", + "data['volume_ratio'] = data['volume'] / (data['volume_ma_20'] + 1e-8)\n", + "\n", + "obv = OnBalanceVolumeIndicator(close=data['close'], volume=data['volume'])\n", + "data['obv'] = obv.on_balance_volume()\n", + "data['obv_ema'] = data['obv'].ewm(span=20).mean()\n", + "data['obv_slope'] = (data['obv'] - data['obv'].shift(5)) / (data['obv'].shift(5) + 1e-8)\n", + "\n", + "# Price action (9 features)\n", + "print(\" 🎯 Price action...\")\n", + "data['returns_1'] = data['close'].pct_change()\n", + "data['returns_5'] = data['close'].pct_change(5)\n", + "data['returns_20'] = data['close'].pct_change(20)\n", + "\n", + "data['volatility_20'] = data['returns_1'].rolling(20).std()\n", + "data['volatility_60'] = data['returns_1'].rolling(60).std()\n", + "\n", + "data['body_size'] = abs(data['close'] - data['open']) / (data['open'] + 1e-8)\n", + "data['upper_wick'] = (data['high'] - data[['open', 'close']].max(axis=1)) / (data['open'] + 1e-8)\n", + "data['lower_wick'] = (data[['open', 'close']].min(axis=1) - data['low']) / (data['open'] + 1e-8)\n", + "\n", + "data['high_20'] = data['high'].rolling(20).max()\n", + "data['low_20'] = data['low'].rolling(20).min()\n", + "data['price_position'] = (data['close'] - data['low_20']) / (data['high_20'] - data['low_20'] + 1e-8)\n", + "\n", + "# Fear & Greed (4 features) ✅ REAL DATA NOW\n", + "print(\" 😨 Fear & Greed...\")\n", + "data['fgi_normalized'] = (data['fgi'] - 50) / 50 # [-1, 1]\n", + "data['fgi_change'] = data['fgi'].diff() / 50\n", + "data['fgi_ma7'] = data['fgi'].rolling(7).mean()\n", + "data['fgi_vs_ma'] = (data['fgi'] - data['fgi_ma7']) / 50\n", + "\n", + "# Time (4 features)\n", + "print(\" 🕐 Time...\")\n", + "data['hour'] = data.index.hour / 24\n", + "data['day_of_week'] = data.index.dayofweek / 7\n", + "data['is_weekend'] = (data.index.dayofweek >= 5).astype(float)\n", + "data['us_session'] = ((data.index.hour >= 14) & (data.index.hour < 21)).astype(float)\n", + "\n", + "btc_features = data.dropna()\n", + "\n", + "print(f\"\\n✅ Feature engineering complete!\")\n", + "feature_count = len([col for col in btc_features.columns if col not in ['open', 'high', 'low', 'close', 'volume']])\n", + "print(f\" Total features: {feature_count} technical 
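features\")\n", + "\n", + "# (Added spot-check, a sketch): the tanh-squashed features (macd, roc_12, cci)\n", + "# are bounded in [-1, 1] by construction - verify one of them:\n", + "assert btc_features['roc_12'].abs().max() <= 1.0, 'roc_12 escaped tanh bounds'\n", + "print(f\" Count check: {feature_count} technical 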
indicators\")\n", + "print(f\" Clean data: {len(btc_features):,} candles\")\n", + "\n", + "# ============================================================================\n", + "# 5-7: NORMALIZATION, SPLIT, ENVIRONMENT (SAME AS BEFORE)\n", + "# ============================================================================\n", + "print(\"\\n🔧 Normalizing features...\")\n", + "\n", + "feature_cols = [col for col in btc_features.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume']]\n", + "\n", + "print(f\" Features to normalize: {len(feature_cols)}\")\n", + "\n", + "scaler = StandardScaler()\n", + "btc_features[feature_cols] = scaler.fit_transform(btc_features[feature_cols])\n", + "btc_features[feature_cols] = btc_features[feature_cols].clip(-5, 5)\n", + "\n", + "print(f\"✅ Features normalized\")\n", + "\n", + "# Train/test split\n", + "train_size = int(len(btc_features) * 0.8)\n", + "train_data = btc_features.iloc[:train_size].copy()\n", + "test_data = btc_features.iloc[train_size:].copy()\n", + "\n", + "print(f\"\\n📊 Data split:\")\n", + "print(f\" Train: {len(train_data):,} candles\")\n", + "print(f\" Test: {len(test_data):,} candles\")\n", + "\n", + "# Environment (same as last working version)\n", + "print(\"\\n🏗️ Building trading environment...\")\n", + "\n", + "class BitcoinTradingEnv(gym.Env):\n", + " \"\"\"Fixed reward calculation environment\"\"\"\n", + " \n", + " def __init__(self, df, initial_balance=10000, episode_length=500, transaction_fee=0.001):\n", + " super().__init__()\n", + " self.df = df.reset_index(drop=True)\n", + " self.initial_balance = initial_balance\n", + " self.episode_length = episode_length\n", + " self.transaction_fee = transaction_fee\n", + " \n", + " self.feature_cols = [col for col in df.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume']]\n", + " \n", + " self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)\n", + " self.observation_space = spaces.Box(\n", + " low=-10, high=10, \n", + " shape=(len(self.feature_cols) + 5,), \n", + " dtype=np.float32\n", + " )\n", + " \n", + " self.reset()\n", + " \n", + " def reset(self):\n", + " max_start = len(self.df) - self.episode_length - 1\n", + " self.start_idx = np.random.randint(100, max_start)\n", + " \n", + " self.current_step = 0\n", + " self.balance = self.initial_balance\n", + " self.position = 0.0\n", + " self.entry_price = 0.0\n", + " self.total_value = self.initial_balance\n", + " self.prev_total_value = self.initial_balance\n", + " self.max_value = self.initial_balance\n", + " self.trades = []\n", + " \n", + " return self._get_obs()\n", + " \n", + " def _get_obs(self):\n", + " idx = self.start_idx + self.current_step\n", + " features = self.df.loc[idx, self.feature_cols].values\n", + " \n", + " current_price = self.df.loc[idx, 'close']\n", + " total_return = (self.total_value / self.initial_balance) - 1\n", + " drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0\n", + " \n", + " portfolio_info = np.array([\n", + " self.position,\n", + " total_return,\n", + " drawdown,\n", + " self.df.loc[idx, 'returns_1'],\n", + " self.df.loc[idx, 'rsi_14']\n", + " ], dtype=np.float32)\n", + " \n", + " obs = np.concatenate([features, portfolio_info])\n", + " return np.clip(obs, -10, 10).astype(np.float32)\n", + " \n", + " def step(self, action):\n", + " idx = self.start_idx + self.current_step\n", + " current_price = self.df.loc[idx, 'close']\n", + " target_position = np.clip(action[0], -1.0, 1.0)\n", + " \n", + " 
        self.prev_total_value = self.total_value\n", + "        # Compute the requested change BEFORE trading: once a trade executes,\n", + "        # self.position already equals target_position, so a post-trade check\n", + "        # can never fire (the trade penalty below was dead code otherwise).\n", + "        position_change = abs(target_position - self.position)\n", + "        \n", + "        if position_change > 0.1:\n", + "            if self.position != 0:\n", + "                self._close_position(current_price)\n", + "            if abs(target_position) > 0.1:\n", + "                self._open_position(target_position, current_price)\n", + "        \n", + "        self._update_total_value(current_price)\n", + "        self.max_value = max(self.max_value, self.total_value)\n", + "        \n", + "        self.current_step += 1\n", + "        done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)\n", + "        \n", + "        reward = (self.total_value - self.prev_total_value) / self.initial_balance\n", + "        \n", + "        if position_change > 0.5:  # small fixed cost for large position swings\n", + "            reward -= 0.0001\n", + "        \n", + "        obs = self._get_obs()\n", + "        info = {'total_value': self.total_value, 'position': self.position}\n", + "        \n", + "        return obs, reward, done, info\n", + "    \n", + "    def _update_total_value(self, current_price):\n", + "        if self.position != 0:\n", + "            if self.position > 0:\n", + "                pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)\n", + "            else:\n", + "                pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)\n", + "            self.total_value = self.balance + pnl\n", + "        else:\n", + "            self.total_value = self.balance\n", + "    \n", + "    def _open_position(self, size, price):\n", + "        self.position = size\n", + "        self.entry_price = price\n", + "    \n", + "    def _close_position(self, price):\n", + "        if self.position > 0:\n", + "            pnl = self.position * self.initial_balance * (price / self.entry_price - 1)\n", + "        else:\n", + "            pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)\n", + "        \n", + "        pnl -= abs(pnl) * self.transaction_fee\n", + "        self.balance += pnl\n", + "        self.position = 0.0\n", + "\n", + "train_env_sac = BitcoinTradingEnv(train_data)\n", + "test_env_sac = BitcoinTradingEnv(test_data)\n", + "\n", + "# Test\n", + "print(\"\\n🧪 Testing environment...\")\n", + "test_state = train_env_sac.reset()\n", + "rewards_collected = []\n", + "for i in range(10):\n", + "    action = np.array([0.5 if i < 5 else -0.3])\n", + "    _, reward, _, _ = train_env_sac.step(action)\n", + "    rewards_collected.append(reward)\n", + "\n", + "print(f\" Non-zero rewards: {sum([abs(r) > 1e-8 for r in rewards_collected])}/10\")\n", + "print(f\" Mean reward: {np.mean(rewards_collected):.6f}\")\n", + "\n", + "if sum([abs(r) > 1e-8 for r in rewards_collected]) > 0:\n", + "    print(\"\\n✅ ENVIRONMENT READY!\")\n", + "else:\n", + "    print(\"\\n❌ REWARD SYSTEM BROKEN\")\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" READY FOR SAC (PyTorch)\")\n", + "print(\"=\"*70)\n", + "print(f\"📊 State dimension: {train_env_sac.observation_space.shape[0]}\")\n", + "print(f\"📊 Features: {len(feature_cols)} (including real Fear & Greed)\")\n", + "print(\"=\"*70)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-12-01T09:01:23.938424Z", "iopub.status.busy": "2025-12-01T09:01:23.938199Z", "iopub.status.idle": "2025-12-01T09:01:24.159423Z", "shell.execute_reply": "2025-12-01T09:01:24.158572Z", "shell.execute_reply.started": "2025-12-01T09:01:23.938406Z" }, "trusted": true }, "outputs": [], "source": [ "# ============================================================================\n", + "# CELL 0.5: FEAR & GREED INDEX LOADER WITH FORWARD FILL\n", + "# Complete solution for loading and broadcasting FGI to 15-min data\n", + "# 
============================================================================\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "print(\"=\"*70)\n", + "print(\" LOADING & BROADCASTING FEAR & GREED INDEX\")\n", + "print(\"=\"*70)\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 1: LOAD FGI DATA (TRY MULTIPLE SOURCES)\n", + "# ----------------------------------------------------------------------------\n", + "fgi_values = None\n", + "\n", + "# TRY SOURCE 1: btc_with_fgi_4h.csv (4-hour intervals)\n", + "try:\n", + " print(\"\\n📂 Trying: btc_with_fgi_4h.csv...\")\n", + " fgi_path = '/kaggle/input/btc-usdt-4h-ohlc-fgi-daily-2020/'\n", + " fgi_df = pd.read_csv(fgi_path + 'btc_with_fgi_4h.csv')\n", + " \n", + " # Parse timestamp and set index\n", + " fgi_df['timestamp'] = pd.to_datetime(fgi_df['timestamp'])\n", + " fgi_df.set_index('timestamp', inplace=True)\n", + " \n", + " # Extract FGI column\n", + " if 'Fear & Greed Index' in fgi_df.columns:\n", + " fgi_values = fgi_df[['Fear & Greed Index']].rename(\n", + " columns={'Fear & Greed Index': 'fgi'}\n", + " )\n", + " fgi_values['fgi'] = pd.to_numeric(fgi_values['fgi'], errors='coerce')\n", + " fgi_values = fgi_values.dropna()\n", + " \n", + " print(f\" ✅ Loaded {len(fgi_values):,} FGI values (4-hour)\")\n", + " print(f\" Range: {fgi_values.index[0]} to {fgi_values.index[-1]}\")\n", + " print(f\" FGI: {fgi_values['fgi'].min():.0f} - {fgi_values['fgi'].max():.0f}\")\n", + " \n", + "except Exception as e:\n", + " print(f\" ❌ Failed: {e}\")\n", + "\n", + "# TRY SOURCE 2: merged_fix_to_hour.csv (hourly intervals)\n", + "if fgi_values is None:\n", + " try:\n", + " print(\"\\n📂 Trying: merged_fix_to_hour.csv...\")\n", + " fgi_path = '/kaggle/input/bitcoin-pulse-market-trends-and-fear-dataset/'\n", + " fgi_df = pd.read_csv(fgi_path + 'merged_fix_to_hour.csv')\n", + " \n", + " fgi_df['timestamp'] = pd.to_datetime(fgi_df['Datetime'])\n", + " fgi_df.set_index('timestamp', inplace=True)\n", + " \n", + " # Handle column name variations\n", + " if 'fear_gread_index' in fgi_df.columns:\n", + " fgi_values = fgi_df[['fear_gread_index']].rename(\n", + " columns={'fear_gread_index': 'fgi'}\n", + " )\n", + " elif 'fear_greed_index' in fgi_df.columns:\n", + " fgi_values = fgi_df[['fear_greed_index']].rename(\n", + " columns={'fear_greed_index': 'fgi'}\n", + " )\n", + " \n", + " if fgi_values is not None:\n", + " fgi_values['fgi'] = pd.to_numeric(fgi_values['fgi'], errors='coerce')\n", + " fgi_values = fgi_values.dropna()\n", + " \n", + " print(f\" ✅ Loaded {len(fgi_values):,} FGI values (hourly)\")\n", + " print(f\" Range: {fgi_values.index[0]} to {fgi_values.index[-1]}\")\n", + " print(f\" FGI: {fgi_values['fgi'].min():.0f} - {fgi_values['fgi'].max():.0f}\")\n", + " \n", + " except Exception as e:\n", + " print(f\" ❌ Failed: {e}\")\n", + "\n", + "# FALLBACK: Create neutral dummy values if both sources fail\n", + "if fgi_values is None:\n", + " print(\"\\n⚠️ All sources failed - using neutral FGI (50)\")\n", + " fgi_values = pd.DataFrame({\n", + " 'fgi': [50] * len(train_data)\n", + " }, index=train_data.index)\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 2: BROADCAST FGI TO 15-MINUTE DATA (FORWARD FILL)\n", + "# ----------------------------------------------------------------------------\n", + "print(\"\\n🔗 Broadcasting FGI to 15-minute Bitcoin data...\")\n", + "\n", + "# Remove existing FGI columns if they exist\n", + 
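"# (Added aside, illustrative only): a self-contained toy of the forward-fill\n", + "# broadcast this step performs - every fine-grained timestamp inherits the\n", + "# most recent coarse value. The _daily/_grid names are hypothetical, and the\n", + "# real code below uses an equivalent join + ffill instead of reindex.\n", + "_daily = pd.Series([25.0, 60.0], index=pd.to_datetime(['2021-01-01', '2021-01-02']))\n", + "_grid = pd.date_range('2021-01-01', periods=6, freq='15min')\n", + "_demo = _daily.reindex(_grid.union(_daily.index)).ffill().reindex(_grid)\n", + "print(f\"  (demo) ffill broadcast: {_demo.tolist()}\")  # -> six 25.0s (Jan 1 value)\n", + "\n", + "# First, drop any FGI columns left over from previous runs (keeps this cell idempotent):\n", +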
"fgi_cols_to_remove = ['fgi', 'fgi_normalized', 'fgi_change', 'fgi_ma7', 'fgi_vs_ma']\n", + "train_data = train_data.drop(columns=fgi_cols_to_remove, errors='ignore')\n", + "test_data = test_data.drop(columns=fgi_cols_to_remove, errors='ignore')\n", + "\n", + "# Join FGI data to train/test (left join keeps all 15-min timestamps)\n", + "train_data = train_data.join(fgi_values, how='left')\n", + "test_data = test_data.join(fgi_values, how='left')\n", + "\n", + "# Forward fill: Each 15-min candle gets the most recent FGI value\n", + "train_data['fgi'] = train_data['fgi'].fillna(method='ffill')\n", + "test_data['fgi'] = test_data['fgi'].fillna(method='ffill')\n", + "\n", + "# Backward fill for any NaN at the start\n", + "train_data['fgi'] = train_data['fgi'].fillna(method='bfill')\n", + "test_data['fgi'] = test_data['fgi'].fillna(method='bfill')\n", + "\n", + "# Final fallback: neutral value\n", + "train_data['fgi'] = train_data['fgi'].fillna(50)\n", + "test_data['fgi'] = test_data['fgi'].fillna(50)\n", + "\n", + "print(f\"✅ FGI broadcasted successfully\")\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 3: CREATE NORMALIZED FGI FEATURES (FOR RL AGENT)\n", + "# ----------------------------------------------------------------------------\n", + "print(\"\\n🔧 Creating normalized FGI features...\")\n", + "\n", + "for df in [train_data, test_data]:\n", + " # Normalize to [-1, 1] range (agent-friendly)\n", + " df['fgi_normalized'] = (df['fgi'] - 50) / 50\n", + " \n", + " # FGI change (momentum)\n", + " df['fgi_change'] = df['fgi'].diff() / 50\n", + " \n", + " # 7-period moving average\n", + " df['fgi_ma7'] = df['fgi'].rolling(7).mean()\n", + " \n", + " # Deviation from MA\n", + " df['fgi_vs_ma'] = (df['fgi'] - df['fgi_ma7']) / 50\n", + " \n", + " # Fill NaN from rolling operations\n", + " df['fgi_change'] = df['fgi_change'].fillna(0)\n", + " df['fgi_ma7'] = df['fgi_ma7'].fillna(df['fgi'])\n", + " df['fgi_vs_ma'] = df['fgi_vs_ma'].fillna(0)\n", + " \n", + " # Clip extreme values\n", + " for col in ['fgi_normalized', 'fgi_change', 'fgi_vs_ma']:\n", + " df[col] = df[col].clip(-5, 5)\n", + "\n", + "print(\"✅ FGI features created\")\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 4: REMOVE RAW FGI FROM FEATURES (ONLY KEEP NORMALIZED)\n", + "# ----------------------------------------------------------------------------\n", + "# Remove raw FGI and FGI_ma7 from feature list (RL agent should only see normalized)\n", + "print(\"\\n🧹 Cleaning feature columns...\")\n", + "\n", + "# Update feature columns (exclude raw OHLCV and raw FGI)\n", + "feature_cols = [col for col in train_data.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume', 'fgi', 'fgi_ma7']]\n", + "\n", + "print(f\"✅ Feature columns updated: {len(feature_cols)} features\")\n", + "print(f\" FGI features: fgi_normalized, fgi_change, fgi_vs_ma\")\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 5: VERIFY DATA QUALITY\n", + "# ----------------------------------------------------------------------------\n", + "print(\"\\n📊 Fear & Greed Index Statistics:\")\n", + "print(f\" Training data:\")\n", + "print(f\" Unique values: {train_data['fgi'].nunique()}\")\n", + "print(f\" Mean: {train_data['fgi'].mean():.1f}\")\n", + "print(f\" Std: {train_data['fgi'].std():.1f}\")\n", + "print(f\" Range: [{train_data['fgi'].min():.0f}, {train_data['fgi'].max():.0f}]\")\n", + 
"\n", + "if train_data['fgi'].nunique() > 10:\n", + " print(\"\\n ✅ REAL FGI DATA LOADED!\")\n", + "else:\n", + " print(\"\\n ⚠️ Low variation - likely dummy data\")\n", + "\n", + "# Sample values\n", + "print(\"\\n Sample FGI over time:\")\n", + "sample_indices = np.linspace(0, len(train_data)-1, 5, dtype=int)\n", + "for idx in sample_indices:\n", + " date = train_data.index[idx]\n", + " fgi_val = train_data.iloc[idx]['fgi']\n", + " print(f\" {date}: FGI = {fgi_val:.0f}\")\n", + "\n", + "# ----------------------------------------------------------------------------\n", + "# STEP 6: RECREATE ENVIRONMENT WITH NEW STATE DIMENSION\n", + "# ----------------------------------------------------------------------------\n", + "print(\"\\n🏗️ Recreating trading environments...\")\n", + "\n", + "# Calculate new state dimension\n", + "state_dim = len(feature_cols) + 5 # features + portfolio state\n", + "print(f\" State dimension: {state_dim}\")\n", + "print(f\" Features: {len(feature_cols)}\")\n", + "print(f\" Portfolio state: 5 (balance, position, entry_price, etc.)\")\n", + "\n", + "# Recreate environments\n", + "train_env_sac = BitcoinTradingEnv(train_data, initial_balance=10000)\n", + "test_env_sac = BitcoinTradingEnv(test_data, initial_balance=10000)\n", + "\n", + "# Quick test\n", + "test_state = train_env_sac.reset()\n", + "print(f\"\\n🧪 Environment test:\")\n", + "print(f\" State shape: {test_state.shape}\")\n", + "print(f\" State range: [{test_state.min():.3f}, {test_state.max():.3f}]\")\n", + "\n", + "# Test a few steps\n", + "rewards = []\n", + "for i in range(5):\n", + " action = np.array([0.3])\n", + " _, reward, _, _ = train_env_sac.step(action)\n", + " rewards.append(reward)\n", + "\n", + "non_zero = sum([abs(r) > 1e-8 for r in rewards])\n", + "print(f\" Non-zero rewards: {non_zero}/5\")\n", + "\n", + "if non_zero > 0:\n", + " print(\" ✅ Environment working!\")\n", + "else:\n", + " print(\" ⚠️ All rewards zero - check environment\")\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" ✅ FGI LOADING COMPLETE\")\n", + "print(\"=\"*70)\n", + "print(f\"📊 Final state dimension: {state_dim}\")\n", + "print(f\"📊 FGI features: 3 normalized features (fgi_normalized, fgi_change, fgi_vs_ma)\")\n", + "print(f\"\\n⚠️ IMPORTANT: You must now recreate your SAC agent!\")\n", + "print(f\" New state_dim = {state_dim}\")\n", + "print(\"\\n▶️ NEXT STEPS:\")\n", + "print(\" 1. Re-run Cell 1 (SAC Agent) with new state_dim\")\n", + "print(\" 2. Re-run Cell 2 (Replay Buffer) with new state_dim\")\n", + "print(\" 3. 
Start training\")\n", + "print(\"=\"*70)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:24.160739Z", + "iopub.status.busy": "2025-12-01T09:01:24.160466Z", + "iopub.status.idle": "2025-12-01T09:01:31.475105Z", + "shell.execute_reply": "2025-12-01T09:01:31.474330Z", + "shell.execute_reply.started": "2025-12-01T09:01:24.160720Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.6: LOAD SENTIMENT DATA - FIXED VERSION WITH CLEANUP\n", + "# ============================================================================\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "print(\"=\"*70)\n", + "print(\" LOADING SENTIMENT DATA (3-HOUR BITCOIN NEWS)\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# CLEAN UP ANY EXISTING SENTIMENT COLUMNS (FROM PREVIOUS RUNS)\n", + "# ============================================================================\n", + "print(\"\\n🧹 Cleaning up existing sentiment columns...\")\n", + "\n", + "sentiment_cols_to_remove = [\n", + " 'prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence',\n", + " 'sentiment_net', 'sentiment_strength', 'sentiment_weighted',\n", + " 'sentiment_change', 'sentiment_ma7', 'sentiment_volatility'\n", + "]\n", + "\n", + "# Remove from train_data\n", + "existing_in_train = [col for col in sentiment_cols_to_remove if col in train_data.columns]\n", + "if existing_in_train:\n", + " print(f\" Removing from train_data: {existing_in_train}\")\n", + " train_data = train_data.drop(columns=existing_in_train)\n", + "\n", + "# Remove from test_data\n", + "existing_in_test = [col for col in sentiment_cols_to_remove if col in test_data.columns]\n", + "if existing_in_test:\n", + " print(f\" Removing from test_data: {existing_in_test}\")\n", + " test_data = test_data.drop(columns=existing_in_test)\n", + "\n", + "print(f\"✅ Cleanup complete!\")\n", + "print(f\" Train shape: {train_data.shape}\")\n", + "print(f\" Test shape: {test_data.shape}\")\n", + "\n", + "# ============================================================================\n", + "# LOAD RAW SENTIMENT DATA\n", + "# ============================================================================\n", + "sentiment_file = '/kaggle/input/bitcoin-news-with-sentimen/bitcoin_news_3hour_intervals_with_sentiment.csv'\n", + "\n", + "print(\"\\n📂 Loading sentiment data...\")\n", + "sentiment_raw = pd.read_csv(sentiment_file)\n", + "\n", + "print(f\"✅ File loaded!\")\n", + "print(f\" Shape: {sentiment_raw.shape}\")\n", + "\n", + "# ============================================================================\n", + "# PARSE TIMESTAMP - FIX THE TIME RANGE FORMAT\n", + "# ============================================================================\n", + "# Your timestamps are like \"2021-01-01 03:00-05:59\" (range format)\n", + "# We need just the start time: \"2021-01-01 03:00\"\n", + "\n", + "# Extract start time from range (before the hyphen)\n", + "def parse_time_range(time_str):\n", + " \"\"\"Convert '2021-01-01 03:00-05:59' to '2021-01-01 03:00:00'\"\"\"\n", + " # Split on space to get date and time parts\n", + " parts = str(time_str).split(' ')\n", + " if len(parts) >= 2:\n", + " date = parts[0] # '2021-01-01'\n", + " time_range = parts[1] # '03:00-05:59'\n", + " start_time = time_range.split('-')[0] # '03:00'\n", + 
" return f\"{date} {start_time}:00\"\n", + " return time_str\n", + "\n", + "sentiment_raw['timestamp'] = sentiment_raw['time_interval'].apply(parse_time_range)\n", + "sentiment_raw['timestamp'] = pd.to_datetime(sentiment_raw['timestamp'])\n", + "sentiment_raw = sentiment_raw.set_index('timestamp')\n", + "sentiment_raw = sentiment_raw.sort_index()\n", + "\n", + "print(f\"\\n📅 Date range: {sentiment_raw.index[0]} to {sentiment_raw.index[-1]}\")\n", + "\n", + "# ============================================================================\n", + "# EXTRACT SENTIMENT PROBABILITY COLUMNS\n", + "# ============================================================================\n", + "print(\"\\n🔍 Extracting sentiment columns...\")\n", + "\n", + "sentiment_clean = pd.DataFrame(index=sentiment_raw.index)\n", + "sentiment_clean['prob_bullish'] = pd.to_numeric(sentiment_raw['prob_bullish'], errors='coerce')\n", + "sentiment_clean['prob_bearish'] = pd.to_numeric(sentiment_raw['prob_bearish'], errors='coerce')\n", + "sentiment_clean['prob_neutral'] = pd.to_numeric(sentiment_raw['prob_neutral'], errors='coerce')\n", + "sentiment_clean['confidence'] = pd.to_numeric(sentiment_raw['sentiment_confidence'], errors='coerce')\n", + "\n", + "# Drop rows with NaN\n", + "sentiment_clean = sentiment_clean.dropna()\n", + "\n", + "print(f\"\\n✅ Cleaned sentiment data: {len(sentiment_clean):,} records\")\n", + "print(f\"\\n📊 Sample values (first 3 rows):\")\n", + "print(sentiment_clean.head(3))\n", + "\n", + "# Verify data variation\n", + "print(f\"\\n📊 Data variation check:\")\n", + "print(f\" Unique prob_bullish values: {sentiment_clean['prob_bullish'].nunique()}\")\n", + "print(f\" Bullish range: [{sentiment_clean['prob_bullish'].min():.3f}, {sentiment_clean['prob_bullish'].max():.3f}]\")\n", + "print(f\" Mean ± Std: {sentiment_clean['prob_bullish'].mean():.3f} ± {sentiment_clean['prob_bullish'].std():.3f}\")\n", + "\n", + "if sentiment_clean['prob_bullish'].nunique() < 10:\n", + " print(\"\\n⚠️ WARNING: Very low variation in sentiment data!\")\n", + "else:\n", + " print(\"\\n✅ Sentiment data has excellent variation!\")\n", + "\n", + "# ============================================================================\n", + "# MERGE WITH BITCOIN DATA (FORWARD FILL FROM 3H TO 15MIN)\n", + "# ============================================================================\n", + "print(\"\\n🔗 Merging sentiment with Bitcoin data...\")\n", + "print(\" Method: Forward fill (each 3h sentiment → 12 x 15min candles)\")\n", + "\n", + "# Join sentiment data\n", + "train_data = train_data.join(sentiment_clean, how='left')\n", + "test_data = test_data.join(sentiment_clean, how='left')\n", + "\n", + "print(f\" Train shape after merge: {train_data.shape}\")\n", + "print(f\" Test shape after merge: {test_data.shape}\")\n", + "\n", + "# Check NaN before forward fill\n", + "train_nan_before = train_data['prob_bullish'].isnull().sum()\n", + "test_nan_before = test_data['prob_bullish'].isnull().sum()\n", + "print(f\"\\n📊 NaN counts before forward fill:\")\n", + "print(f\" Train: {train_nan_before:,} / {len(train_data):,} ({train_nan_before/len(train_data)*100:.1f}%)\")\n", + "print(f\" Test: {test_nan_before:,} / {len(test_data):,} ({test_nan_before/len(test_data)*100:.1f}%)\")\n", + "\n", + "# Forward fill (broadcast 3-hour sentiment to 15-minute intervals)\n", + "for col in ['prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence']:\n", + " train_data[col] = train_data[col].fillna(method='ffill')\n", + " test_data[col] = 
test_data[col].fillna(method='ffill')\n", + " \n", + " # Backward fill for start of data\n", + " train_data[col] = train_data[col].fillna(method='bfill')\n", + " test_data[col] = test_data[col].fillna(method='bfill')\n", + " \n", + " # Final fallback (should rarely be needed)\n", + " if col == 'confidence':\n", + " train_data[col] = train_data[col].fillna(0.5)\n", + " test_data[col] = test_data[col].fillna(0.5)\n", + " else:\n", + " train_data[col] = train_data[col].fillna(0.33)\n", + " test_data[col] = test_data[col].fillna(0.33)\n", + "\n", + "print(f\"\\n✅ Forward fill complete!\")\n", + "\n", + "# Verify no more NaN\n", + "train_nan_after = train_data['prob_bullish'].isnull().sum()\n", + "test_nan_after = test_data['prob_bullish'].isnull().sum()\n", + "print(f\" Train NaN after: {train_nan_after}\")\n", + "print(f\" Test NaN after: {test_nan_after}\")\n", + "\n", + "# Verify data quality after merge\n", + "print(f\"\\n📊 Sentiment stats after merge (train):\")\n", + "print(f\" prob_bullish: {train_data['prob_bullish'].mean():.3f} ± {train_data['prob_bullish'].std():.3f}\")\n", + "print(f\" Unique values: {train_data['prob_bullish'].nunique():,}\")\n", + "\n", + "# Show sample (should NOT be all same value)\n", + "print(f\"\\n📊 Sample of first 15 candles (should see repeating blocks of 12):\")\n", + "print(train_data[['prob_bullish', 'prob_bearish', 'confidence']].iloc[:15])\n", + "\n", + "# ============================================================================\n", + "# CREATE DERIVED SENTIMENT FEATURES\n", + "# ============================================================================\n", + "print(\"\\n🔧 Creating derived sentiment features...\")\n", + "\n", + "for df in [train_data, test_data]:\n", + " # 1. Net sentiment (bullish - bearish)\n", + " df['sentiment_net'] = df['prob_bullish'] - df['prob_bearish']\n", + " \n", + " # 2. Sentiment strength (absolute difference)\n", + " df['sentiment_strength'] = (df['prob_bullish'] - df['prob_bearish']).abs()\n", + " \n", + " # 3. Weighted sentiment (net * confidence)\n", + " df['sentiment_weighted'] = df['sentiment_net'] * df['confidence']\n", + " \n", + " # 4. Sentiment change (first difference)\n", + " df['sentiment_change'] = df['sentiment_net'].diff()\n", + " \n", + " # 5. Sentiment 7-period moving average\n", + " df['sentiment_ma7'] = df['sentiment_net'].rolling(7, min_periods=1).mean()\n", + " \n", + " # 6. 
Sentiment volatility (20-period std)\n", + " df['sentiment_volatility'] = df['sentiment_net'].rolling(20, min_periods=1).std()\n", + " \n", + " # Fill NaN from rolling operations\n", + " df['sentiment_change'] = df['sentiment_change'].fillna(0)\n", + " df['sentiment_volatility'] = df['sentiment_volatility'].fillna(0)\n", + "\n", + "print(\"✅ Derived features created!\")\n", + "print(\" Raw (4): prob_bullish, prob_bearish, prob_neutral, confidence\")\n", + "print(\" Derived (6): sentiment_net, sentiment_strength, sentiment_weighted,\")\n", + "print(\" sentiment_change, sentiment_ma7, sentiment_volatility\")\n", + "\n", + "# ============================================================================\n", + "# FINAL VERIFICATION\n", + "# ============================================================================\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" FINAL VERIFICATION\")\n", + "print(\"=\"*70)\n", + "\n", + "all_sentiment_cols = ['prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence',\n", + " 'sentiment_net', 'sentiment_strength', 'sentiment_weighted']\n", + "\n", + "print(f\"\\n📊 Sentiment statistics (train data):\")\n", + "for col in all_sentiment_cols:\n", + " mean = train_data[col].mean()\n", + " std = train_data[col].std()\n", + " min_val = train_data[col].min()\n", + " max_val = train_data[col].max()\n", + " print(f\" {col:25s}: {mean:7.3f} ± {std:.3f} [{min_val:.3f}, {max_val:.3f}]\")\n", + "\n", + "# Check if data loaded successfully\n", + "bullish_std = train_data['prob_bullish'].std()\n", + "if bullish_std < 0.01:\n", + " print(\"\\n❌ ERROR: Sentiment data is constant!\")\n", + " print(f\" prob_bullish std = {bullish_std:.6f} (should be > 0.1)\")\n", + "elif bullish_std < 0.10:\n", + " print(\"\\n⚠️ WARNING: Low sentiment variation\")\n", + " print(f\" prob_bullish std = {bullish_std:.3f} (should be > 0.1)\")\n", + "else:\n", + " print(f\"\\n✅ SUCCESS: Sentiment data loaded with real variation!\")\n", + " print(f\" prob_bullish std = {bullish_std:.3f}\")\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" SENTIMENT LOADING COMPLETE\")\n", + "print(\"=\"*70)\n", + "print(f\"📊 Total sentiment features: 10 (4 raw + 6 derived)\")\n", + "print(f\"📊 Train data shape: {train_data.shape}\")\n", + "print(f\"📊 Test data shape: {test_data.shape}\")\n", + "print(\"\\n▶️ NOW PROCEED TO CELL 0.7 (NORMALIZATION)\")\n", + "print(\"=\"*70)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:31.476613Z", + "iopub.status.busy": "2025-12-01T09:01:31.475961Z", + "iopub.status.idle": "2025-12-01T09:01:32.570678Z", + "shell.execute_reply": "2025-12-01T09:01:32.569897Z", + "shell.execute_reply.started": "2025-12-01T09:01:31.476577Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.7: NORMALIZE ALL FEATURES (INCLUDING SENTIMENT)\n", + "# ============================================================================\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "import numpy as np\n", + "\n", + "print(\"=\"*70)\n", + "print(\" NORMALIZING ALL FEATURES (TECHNICAL + SENTIMENT)\")\n", + "print(\"=\"*70)\n", + "\n", + "# Get all feature columns (everything except OHLCV)\n", + "feature_cols = [col for col in train_data.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume']]\n", + "\n", + "print(f\"\\n📊 Features to normalize: {len(feature_cols)}\")\n", + 
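"# (Added sanity check, a sketch, assuming recent scikit-learn): scalers ignore\n", + "# NaNs when fitting and pass them through transform, so any NaN here would\n", + "# silently reach the agent's observations. Fail fast instead:\n", + "_nan_counts = train_data[feature_cols].isnull().sum()\n", + "assert _nan_counts.sum() == 0, f\"NaN in features: {_nan_counts[_nan_counts > 0].to_dict()}\"\n", +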
"\n", + "# Separate into groups for verification\n", + "technical_features = [col for col in feature_cols if 'sentiment' not in col and 'fgi' not in col]\n", + "sentiment_features = [col for col in feature_cols if 'sentiment' in col]\n", + "fgi_features = [col for col in feature_cols if 'fgi' in col and 'sentiment' not in col]\n", + "time_features = [col for col in feature_cols if col in ['hour', 'day_of_week', 'is_weekend', 'us_session']]\n", + "\n", + "print(f\" Technical: {len(technical_features)}\")\n", + "print(f\" Sentiment: {len(sentiment_features)}\")\n", + "print(f\" Fear & Greed: {len(fgi_features)}\")\n", + "print(f\" Time: {len(time_features)}\")\n", + "\n", + "# ============================================================================\n", + "# FIT SCALER ON TRAINING DATA ONLY (PREVENT DATA LEAKAGE)\n", + "# ============================================================================\n", + "print(\"\\n🔧 Fitting StandardScaler on training data...\")\n", + "\n", + "scaler = StandardScaler()\n", + "scaler.fit(train_data[feature_cols])\n", + "\n", + "print(f\"✅ Scaler fitted on {len(train_data):,} training samples\")\n", + "\n", + "# ============================================================================\n", + "# TRANSFORM BOTH TRAIN AND TEST DATA\n", + "# ============================================================================\n", + "print(\"\\n🔄 Transforming features...\")\n", + "\n", + "train_data[feature_cols] = scaler.transform(train_data[feature_cols])\n", + "test_data[feature_cols] = scaler.transform(test_data[feature_cols])\n", + "\n", + "# Clip extreme outliers to [-5, 5] (prevents numerical instability)\n", + "train_data[feature_cols] = train_data[feature_cols].clip(-5, 5)\n", + "test_data[feature_cols] = test_data[feature_cols].clip(-5, 5)\n", + "\n", + "print(f\"✅ Features normalized and clipped to [-5, 5]\")\n", + "\n", + "# ============================================================================\n", + "# VERIFY NORMALIZATION\n", + "# ============================================================================\n", + "print(\"\\n📊 Verification (training data):\")\n", + "print(f\" Mean: {train_data[feature_cols].mean().mean():.6f} (should be ~0)\")\n", + "print(f\" Std: {train_data[feature_cols].std().mean():.6f} (should be ~1)\")\n", + "print(f\" Min: {train_data[feature_cols].min().min():.2f}\")\n", + "print(f\" Max: {train_data[feature_cols].max().max():.2f}\")\n", + "\n", + "# Show sample of sentiment features after normalization\n", + "print(\"\\n📊 Sentiment features after normalization (sample):\")\n", + "sample_sentiment = train_data[sentiment_features].iloc[1000:1003]\n", + "print(sample_sentiment)\n", + "\n", + "print(\"\\n✅ Normalization complete!\")\n", + "print(\"=\"*70)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:32.571926Z", + "iopub.status.busy": "2025-12-01T09:01:32.571468Z", + "iopub.status.idle": "2025-12-01T09:01:32.799955Z", + "shell.execute_reply": "2025-12-01T09:01:32.798999Z", + "shell.execute_reply.started": "2025-12-01T09:01:32.571903Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.8: TRAIN/VALID/TEST SPLIT (PROPER RL SETUP)\n", + "# ============================================================================\n", + "\n", + "print(\"=\"*70)\n", + "print(\" DATA SPLIT: TRAIN / VALID / TEST\")\n", + "print(\"=\"*70)\n", + "\n", 
+ "# Get the full dataset (before it was split)\n", + "# We need to go back to btc_features (after normalization in Cell 0.7)\n", + "full_data = pd.concat([train_data, test_data]).sort_index()\n", + "\n", + "print(f\"\\n📊 Full dataset: {len(full_data):,} candles\")\n", + "print(f\" Date range: {full_data.index[0]} to {full_data.index[-1]}\")\n", + "\n", + "# ============================================================================\n", + "# SPLIT RATIOS (CHRONOLOGICAL)\n", + "# ============================================================================\n", + "# 70% train, 15% validation, 15% test\n", + "train_ratio = 0.70\n", + "valid_ratio = 0.15\n", + "test_ratio = 0.15\n", + "\n", + "train_size = int(len(full_data) * train_ratio)\n", + "valid_size = int(len(full_data) * valid_ratio)\n", + "\n", + "# Split chronologically (NEVER shuffle time series!)\n", + "train_data = full_data.iloc[:train_size].copy()\n", + "valid_data = full_data.iloc[train_size:train_size+valid_size].copy()\n", + "test_data = full_data.iloc[train_size+valid_size:].copy()\n", + "\n", + "print(f\"\\n📊 Data split:\")\n", + "print(f\" Train: {len(train_data):,} candles ({train_ratio*100:.0f}%)\")\n", + "print(f\" Validation: {len(valid_data):,} candles ({valid_ratio*100:.0f}%)\")\n", + "print(f\" Test: {len(test_data):,} candles ({test_ratio*100:.0f}%)\")\n", + "\n", + "print(f\"\\n📅 Date ranges:\")\n", + "print(f\" Train: {train_data.index[0]} to {train_data.index[-1]}\")\n", + "print(f\" Validation: {valid_data.index[0]} to {valid_data.index[-1]}\")\n", + "print(f\" Test: {test_data.index[0]} to {test_data.index[-1]}\")\n", + "\n", + "# Verify no overlap\n", + "assert train_data.index[-1] < valid_data.index[0], \"❌ Train/Valid overlap!\"\n", + "assert valid_data.index[-1] < test_data.index[0], \"❌ Valid/Test overlap!\"\n", + "print(\"\\n✅ No data leakage - all splits are chronologically separated\")\n", + "\n", + "# Verify all have sentiment features\n", + "sentiment_cols = [col for col in train_data.columns if 'sentiment' in col or 'prob_' in col]\n", + "print(f\"\\n✅ Sentiment features in all splits: {len(sentiment_cols)}\")\n", + "\n", + "print(\"=\"*70)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2025-12-01T09:01:32.801070Z", + "iopub.status.busy": "2025-12-01T09:01:32.800784Z", + "iopub.status.idle": "2025-12-01T09:01:32.854777Z", + "shell.execute_reply": "2025-12-01T09:01:32.854224Z", + "shell.execute_reply.started": "2025-12-01T09:01:32.801048Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.10: V13 TRADING ENVIRONMENT\n", + "# ============================================================================\n", + "\n", + "import gym\n", + "from gym import spaces\n", + "import numpy as np\n", + "\n", + "print(\"=\"*70)\n", + "print(\" V13 TRADING ENVIRONMENT - SIMPLE RETURN + DOMAIN RANDOMIZATION\")\n", + "print(\"=\"*70)\n", + "\n", + "class V13TradingEnv(gym.Env):\n", + " \"\"\"\n", + " V13 Environment Features:\n", + " - Simple return reward: (V_t - V_{t-1}) / initial_balance\n", + " - Domain randomization: Variable fees, episode length\n", + " - No holding penalty (clean signal)\n", + " - Realistic transaction costs\n", + " \"\"\"\n", + " \n", + " def __init__(self, df, initial_balance=10000, base_episode_length=500, \n", + " base_transaction_fee=0.001, domain_randomization=True):\n", + " super().__init__()\n", + " 
self.df = df.reset_index(drop=True)\n", + " self.initial_balance = initial_balance\n", + " self.base_episode_length = base_episode_length\n", + " self.base_transaction_fee = base_transaction_fee\n", + " self.domain_randomization = domain_randomization\n", + " \n", + " # Extract feature columns (everything except OHLCV)\n", + " self.feature_cols = [col for col in df.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume']]\n", + " \n", + " # Action space: continuous [-1, 1] (short to long)\n", + " self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)\n", + " \n", + " # Observation space: features + portfolio info\n", + " self.observation_space = spaces.Box(\n", + " low=-10, high=10, \n", + " shape=(len(self.feature_cols) + 5,), \n", + " dtype=np.float32\n", + " )\n", + " \n", + " self.reset()\n", + " \n", + " def reset(self):\n", + " \"\"\"Reset environment with domain randomization\"\"\"\n", + " \n", + " # Domain Randomization (if enabled)\n", + " if self.domain_randomization:\n", + " # Randomize episode length (±10%)\n", + " self.episode_length = np.random.randint(\n", + " int(self.base_episode_length * 0.9),\n", + " int(self.base_episode_length * 1.1)\n", + " )\n", + " \n", + " # Randomize transaction fee (0.07% - 0.12%)\n", + " self.transaction_fee = np.random.uniform(0.0007, 0.0012)\n", + " else:\n", + " self.episode_length = self.base_episode_length\n", + " self.transaction_fee = self.base_transaction_fee\n", + " \n", + " # Random start position (avoid first/last 100 candles)\n", + " max_start = len(self.df) - self.episode_length - 100\n", + " self.start_idx = np.random.randint(100, max_start)\n", + " \n", + " # Initialize portfolio state\n", + " self.current_step = 0\n", + " self.balance = self.initial_balance\n", + " self.position = 0.0 # -1 (full short) to +1 (full long)\n", + " self.entry_price = 0.0\n", + " self.total_value = self.initial_balance\n", + " self.prev_total_value = self.initial_balance\n", + " self.max_value = self.initial_balance\n", + " \n", + " return self._get_obs()\n", + " \n", + " def _get_obs(self):\n", + " \"\"\"Get current observation (features + portfolio info)\"\"\"\n", + " idx = self.start_idx + self.current_step\n", + " \n", + " # Market features (normalized technical indicators)\n", + " features = self.df.loc[idx, self.feature_cols].values\n", + " \n", + " # Portfolio information\n", + " current_price = self.df.loc[idx, 'close']\n", + " total_return = (self.total_value / self.initial_balance) - 1\n", + " drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0\n", + " \n", + " portfolio_info = np.array([\n", + " self.position, # Current position\n", + " total_return, # Total return\n", + " drawdown, # Drawdown\n", + " self.df.loc[idx, 'returns_1'] if 'returns_1' in self.df.columns else 0, # Market momentum\n", + " self.df.loc[idx, 'rsi_14'] if 'rsi_14' in self.df.columns else 0.5 # Overbought/oversold\n", + " ], dtype=np.float32)\n", + " \n", + " # Concatenate and clip\n", + " obs = np.concatenate([features, portfolio_info])\n", + " return np.clip(obs, -10, 10).astype(np.float32)\n", + " \n", + " def step(self, action):\n", + " \"\"\"Execute one step\"\"\"\n", + " idx = self.start_idx + self.current_step\n", + " current_price = self.df.loc[idx, 'close']\n", + " target_position = np.clip(action[0], -1.0, 1.0)\n", + " \n", + " # Store previous value for reward calculation\n", + " self.prev_total_value = self.total_value\n", + " \n", + " # Execute position changes\n", + " 
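# A 0.1 dead-band ignores small action jitter; a rebalance is modeled as a\n", + " # full close (fee on realized PnL) followed by a fresh open.\n", + " 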
position_change = abs(target_position - self.position)\n", + " if position_change > 0.1: # Only trade if significant change\n", + " if self.position != 0:\n", + " self._close_position(current_price)\n", + " if abs(target_position) > 0.1:\n", + " self._open_position(target_position, current_price)\n", + " \n", + " # Update portfolio value\n", + " self._update_total_value(current_price)\n", + " self.max_value = max(self.max_value, self.total_value)\n", + " \n", + " # Advance step\n", + " self.current_step += 1\n", + " done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)\n", + " \n", + " # V13 REWARD: SIMPLE RETURN\n", + " reward = (self.total_value - self.prev_total_value) / self.initial_balance\n", + " \n", + " # Get next observation\n", + " obs = self._get_obs()\n", + " \n", + " # Info for logging\n", + " info = {\n", + " 'total_value': self.total_value,\n", + " 'position': self.position,\n", + " 'episode_length': self.episode_length,\n", + " 'transaction_fee': self.transaction_fee\n", + " }\n", + " \n", + " return obs, reward, done, info\n", + " \n", + " def _update_total_value(self, current_price):\n", + " \"\"\"Update total portfolio value (balance + unrealized PnL)\"\"\"\n", + " if self.position != 0:\n", + " if self.position > 0: # Long position\n", + " pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)\n", + " else: # Short position\n", + " pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)\n", + " self.total_value = self.balance + pnl\n", + " else:\n", + " self.total_value = self.balance\n", + " \n", + " def _open_position(self, size, price):\n", + " \"\"\"Open new position\"\"\"\n", + " self.position = size\n", + " self.entry_price = price\n", + " \n", + " def _close_position(self, price):\n", + " \"\"\"Close current position and realize PnL\"\"\"\n", + " if self.position > 0: # Close long\n", + " pnl = self.position * self.initial_balance * (price / self.entry_price - 1)\n", + " else: # Close short\n", + " pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)\n", + " \n", + " # Apply transaction fee to PnL\n", + " pnl -= abs(pnl) * self.transaction_fee\n", + " \n", + " # Update balance\n", + " self.balance += pnl\n", + " self.position = 0.0\n", + "\n", + "# ============================================================================\n", + "# CREATE ENVIRONMENTS (Train, Valid, Test)\n", + "# ============================================================================\n", + "print(\"\\n🏗️ Creating V13 environments...\")\n", + "\n", + "train_env_sac = V13TradingEnv(\n", + " train_data, \n", + " initial_balance=10000,\n", + " base_episode_length=500,\n", + " base_transaction_fee=0.001,\n", + " domain_randomization=True # Enabled for training\n", + ")\n", + "\n", + "valid_env_sac = V13TradingEnv(\n", + " valid_data, \n", + " initial_balance=10000,\n", + " base_episode_length=500,\n", + " base_transaction_fee=0.001,\n", + " domain_randomization=False # Disabled for consistent validation\n", + ")\n", + "\n", + "test_env_sac = V13TradingEnv(\n", + " test_data, \n", + " initial_balance=10000,\n", + " base_episode_length=500,\n", + " base_transaction_fee=0.001,\n", + " domain_randomization=False # Disabled for consistent testing\n", + ")\n", + "\n", + "print(f\"✅ Train environment: {len(train_data):,} candles (Domain Rand: ON)\")\n", + "print(f\"✅ Valid environment: {len(valid_data):,} candles (Domain Rand: OFF)\")\n", + "print(f\"✅ Test 
environment: {len(test_data):,} candles (Domain Rand: OFF)\")\n", + "print(f\"✅ State dimension: {train_env_sac.observation_space.shape[0]}\")\n", + "\n", + "print(\"\\n🎯 V13 Environment Configuration:\")\n", + "print(\" Reward: Simple Return = (V_t - V_{t-1}) / $10,000\")\n", + "print(\" Base Episode Length: 500 steps (450-550 with randomization)\")\n", + "print(\" Base Transaction Fee: 0.10% (0.07%-0.12% with randomization)\")\n", + "print(\" Domain Randomization: Training ONLY (for robustness)\")\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" V13 ENVIRONMENTS READY\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.11: MULTI-AGENT TRADING ENVIRONMENTS (4 AGENTS: 2 REWARD TYPES)\n", + "# TIER 3: GPU-ACCELERATED ENVIRONMENTS\n", + "# ============================================================================\n", + "\n", + "import gym\n", + "from gym import spaces\n", + "import numpy as np\n", + "import torch\n", + "\n", + "print(\"=\"*70)\n", + "print(\" MULTI-AGENT ENVIRONMENTS - TIER 3: GPU ACCELERATED\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# GPU ENVIRONMENT BASE CLASS - Stores data on GPU for fast access\n", + "# ============================================================================\n", + "\n", + "class GPUTensorCache:\n", + " \"\"\"\n", + " Caches DataFrame columns as GPU tensors for fast environment access.\n", + " Eliminates CPU DataFrame indexing overhead during training.\n", + " \"\"\"\n", + " def __init__(self, df, device='cuda:0'):\n", + " self.device = torch.device(device) if isinstance(device, str) else device\n", + " \n", + " # Identify feature columns\n", + " self.feature_cols = [col for col in df.columns \n", + " if col not in ['open', 'high', 'low', 'close', 'volume']]\n", + " \n", + " # Pre-compute column indices\n", + " self.feature_indices = [df.columns.get_loc(col) for col in self.feature_cols]\n", + " \n", + " # Convert entire DataFrame to GPU tensor (read-only data)\n", + " self.data_gpu = torch.from_numpy(df.values.astype(np.float32)).to(self.device)\n", + " \n", + " # Cache specific columns for fast access\n", + " self.close_prices = torch.from_numpy(df['close'].values.astype(np.float32)).to(self.device)\n", + " \n", + " if 'returns_1' in df.columns:\n", + " self.returns_1 = torch.from_numpy(df['returns_1'].values.astype(np.float32)).to(self.device)\n", + " else:\n", + " self.returns_1 = torch.zeros(len(df), device=self.device)\n", + " \n", + " if 'rsi_14' in df.columns:\n", + " self.rsi_14 = torch.from_numpy(df['rsi_14'].values.astype(np.float32)).to(self.device)\n", + " else:\n", + " self.rsi_14 = torch.full((len(df),), 0.5, device=self.device)\n", + " \n", + " # Feature tensor for fast slicing\n", + " self.features_gpu = self.data_gpu[:, self.feature_indices]\n", + " \n", + " print(f\" ✓ Cached {len(df):,} rows, {len(self.feature_cols)} features on {self.device}\")\n", + "\n", + "\n", + "# ============================================================================\n", + "# ENVIRONMENT 1: SIMPLE RETURN REWARD + HOLDING PENALTY (GPU OPTIMIZED)\n", + "# ============================================================================\n", + "\n", + "class SimpleReturnEnv(gym.Env):\n", + " \"\"\"\n", + " Environment with Simple Return Reward + INACTIVITY Penalty - GPU OPTIMIZED\n", + " 
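Worked example (default $10,000 balance): a +$50 value change yields reward\n", + " 0.005, while a flat step is docked the default 0.0005 inactivity penalty.\n", + " \n", + " 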
Reward: (V_t - V_{t-1}) / initial_balance - inactivity_penalty (when flat)\n", + " Used by: Agent 1 & Agent 2\n", + " \n", + " TIER 3 Optimization: Data cached on GPU, torch operations for obs\n", + " NOTE: Penalty applied when NOT trading (position=0), not when holding!\n", + " \"\"\"\n", + " \n", + " def __init__(self, df, initial_balance=10000, base_episode_length=500, \n", + " base_transaction_fee=0.001, inactivity_penalty=0.0005, \n", + " domain_randomization=True, device='cuda:0'):\n", + " super().__init__()\n", + " self.df = df.reset_index(drop=True)\n", + " self.initial_balance = initial_balance\n", + " self.base_episode_length = base_episode_length\n", + " self.base_transaction_fee = base_transaction_fee\n", + " self.inactivity_penalty = inactivity_penalty # Penalty for NOT trading\n", + " self.domain_randomization = domain_randomization\n", + " \n", + " # TIER 3: GPU tensor cache\n", + " self.device = torch.device(device) if torch.cuda.is_available() else torch.device('cpu')\n", + " self.gpu_cache = GPUTensorCache(df, self.device)\n", + " \n", + " self.feature_cols = self.gpu_cache.feature_cols\n", + " \n", + " self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)\n", + " self.observation_space = spaces.Box(\n", + " low=-10, high=10, \n", + " shape=(len(self.feature_cols) + 5,), \n", + " dtype=np.float32\n", + " )\n", + " self.reset()\n", + " \n", + " def reset(self):\n", + " if self.domain_randomization:\n", + " self.episode_length = np.random.randint(\n", + " int(self.base_episode_length * 0.9),\n", + " int(self.base_episode_length * 1.1)\n", + " )\n", + " # Only randomize fee if base fee > 0\n", + " if self.base_transaction_fee > 0:\n", + " self.transaction_fee = np.random.uniform(0.0007, 0.0012)\n", + " else:\n", + " self.transaction_fee = 0.0 # Keep zero fee\n", + " else:\n", + " self.episode_length = self.base_episode_length\n", + " self.transaction_fee = self.base_transaction_fee\n", + " \n", + " max_start = len(self.df) - self.episode_length - 100\n", + " self.start_idx = np.random.randint(100, max_start)\n", + " \n", + " self.current_step = 0\n", + " self.balance = self.initial_balance\n", + " self.position = 0.0\n", + " self.entry_price = 0.0\n", + " self.total_value = self.initial_balance\n", + " self.prev_total_value = self.initial_balance\n", + " self.max_value = self.initial_balance\n", + " self.trade_count = 0 # Track number of trades\n", + " \n", + " return self._get_obs()\n", + " \n", + " def _get_obs(self):\n", + " \"\"\"TIER 3: Get observation using GPU tensors\"\"\"\n", + " idx = self.start_idx + self.current_step\n", + " \n", + " # Fast GPU tensor indexing\n", + " features = self.gpu_cache.features_gpu[idx]\n", + " \n", + " # Portfolio info (computed on GPU)\n", + " total_return = (self.total_value / self.initial_balance) - 1\n", + " drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0\n", + " \n", + " portfolio_info = torch.tensor([\n", + " self.position,\n", + " total_return,\n", + " drawdown,\n", + " self.gpu_cache.returns_1[idx].item(),\n", + " self.gpu_cache.rsi_14[idx].item()\n", + " ], device=self.device, dtype=torch.float32)\n", + " \n", + " obs = torch.cat([features, portfolio_info])\n", + " obs = torch.clamp(obs, -10, 10)\n", + " \n", + " # Return CPU numpy for gym compatibility\n", + " return obs.cpu().numpy()\n", + " \n", + " def _get_price(self, idx):\n", + " \"\"\"Fast GPU price lookup\"\"\"\n", + " return self.gpu_cache.close_prices[idx].item()\n", + " \n", + " def step(self, 
action):\n", + " idx = self.start_idx + self.current_step\n", + " current_price = self._get_price(idx)\n", + " target_position = np.clip(action[0], -1.0, 1.0)\n", + " \n", + " self.prev_total_value = self.total_value\n", + " traded = False\n", + " \n", + " position_change = abs(target_position - self.position)\n", + " if position_change > 0.1:\n", + " traded = True\n", + " self.trade_count += 1 # Count this trade\n", + " if self.position != 0:\n", + " self._close_position(current_price)\n", + " if abs(target_position) > 0.1:\n", + " self._open_position(target_position, current_price)\n", + " \n", + " self._update_total_value(current_price)\n", + " self.max_value = max(self.max_value, self.total_value)\n", + " \n", + " self.current_step += 1\n", + " done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)\n", + " \n", + " # SIMPLE RETURN REWARD + INACTIVITY PENALTY\n", + " reward = (self.total_value - self.prev_total_value) / self.initial_balance\n", + " \n", + " # INVERTED: Penalize INACTIVITY (position=0), NOT holding!\n", + " if abs(self.position) < 0.1: # Flat/no position = inactive\n", + " reward -= self.inactivity_penalty\n", + " \n", + " obs = self._get_obs()\n", + " info = {\n", + " 'total_value': self.total_value,\n", + " 'position': self.position,\n", + " 'reward_type': 'simple_return',\n", + " 'trade_count': self.trade_count,\n", + " 'inactivity_penalty': self.inactivity_penalty if abs(self.position) < 0.1 else 0\n", + " }\n", + " \n", + " return obs, reward, done, info\n", + " \n", + " def _update_total_value(self, current_price):\n", + " if self.position != 0:\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)\n", + " self.total_value = self.balance + pnl\n", + " else:\n", + " self.total_value = self.balance\n", + " \n", + " def _open_position(self, size, price):\n", + " self.position = size\n", + " self.entry_price = price\n", + " \n", + " def _close_position(self, price):\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)\n", + " pnl -= abs(pnl) * self.transaction_fee\n", + " self.balance += pnl\n", + " self.position = 0.0\n", + "\n", + "\n", + "# ============================================================================\n", + "# ENVIRONMENT 2: PORTFOLIO LOG RETURN REWARD + INACTIVITY PENALTY (GPU OPTIMIZED)\n", + "# ============================================================================\n", + "\n", + "class LogReturnEnv(gym.Env):\n", + " \"\"\"\n", + " Environment with Portfolio Log Return Reward + INACTIVITY Penalty - GPU OPTIMIZED\n", + " Reward: log(V_t / V_{t-1}) - cost_t - inactivity_penalty (when flat)\n", + " Used by: Agent 3 & Agent 4\n", + " \n", + " TIER 3 Optimization: Data cached on GPU, torch operations for obs\n", + " NOTE: Penalty applied when NOT trading (position=0), not when holding!\n", + " \"\"\"\n", + " \n", + " def __init__(self, df, initial_balance=10000, base_episode_length=500, \n", + " base_transaction_fee=0.001, slippage=0.0005, inactivity_penalty=0.0005,\n", + " domain_randomization=True, device='cuda:0'):\n", + " super().__init__()\n", + " self.df = df.reset_index(drop=True)\n", + " self.initial_balance = initial_balance\n", + " self.base_episode_length 
= base_episode_length\n", + " self.base_transaction_fee = base_transaction_fee\n", + " self.slippage = slippage\n", + " self.inactivity_penalty = inactivity_penalty # Penalty for NOT trading\n", + " self.domain_randomization = domain_randomization\n", + " \n", + " # TIER 3: GPU tensor cache\n", + " self.device = torch.device(device) if torch.cuda.is_available() else torch.device('cpu')\n", + " self.gpu_cache = GPUTensorCache(df, self.device)\n", + " \n", + " self.feature_cols = self.gpu_cache.feature_cols\n", + " \n", + " self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)\n", + " self.observation_space = spaces.Box(\n", + " low=-10, high=10, \n", + " shape=(len(self.feature_cols) + 5,), \n", + " dtype=np.float32\n", + " )\n", + " self.reset()\n", + " \n", + " def reset(self):\n", + " if self.domain_randomization:\n", + " self.episode_length = np.random.randint(\n", + " int(self.base_episode_length * 0.9),\n", + " int(self.base_episode_length * 1.1)\n", + " )\n", + " # Only randomize fee if base fee > 0\n", + " if self.base_transaction_fee > 0:\n", + " self.transaction_fee = np.random.uniform(0.0007, 0.0012)\n", + " else:\n", + " self.transaction_fee = 0.0 # Keep zero fee\n", + " else:\n", + " self.episode_length = self.base_episode_length\n", + " self.transaction_fee = self.base_transaction_fee\n", + " \n", + " max_start = len(self.df) - self.episode_length - 100\n", + " self.start_idx = np.random.randint(100, max_start)\n", + " \n", + " self.current_step = 0\n", + " self.balance = self.initial_balance\n", + " self.position = 0.0\n", + " self.entry_price = 0.0\n", + " self.total_value = self.initial_balance\n", + " self.prev_total_value = self.initial_balance\n", + " self.max_value = self.initial_balance\n", + " self.trade_cost = 0.0\n", + " self.trade_count = 0 # Track number of trades\n", + " \n", + " return self._get_obs()\n", + " \n", + " def _get_obs(self):\n", + " \"\"\"TIER 3: Get observation using GPU tensors\"\"\"\n", + " idx = self.start_idx + self.current_step\n", + " \n", + " # Fast GPU tensor indexing\n", + " features = self.gpu_cache.features_gpu[idx]\n", + " \n", + " # Portfolio info\n", + " total_return = (self.total_value / self.initial_balance) - 1\n", + " drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0\n", + " \n", + " portfolio_info = torch.tensor([\n", + " self.position,\n", + " total_return,\n", + " drawdown,\n", + " self.gpu_cache.returns_1[idx].item(),\n", + " self.gpu_cache.rsi_14[idx].item()\n", + " ], device=self.device, dtype=torch.float32)\n", + " \n", + " obs = torch.cat([features, portfolio_info])\n", + " obs = torch.clamp(obs, -10, 10)\n", + " \n", + " return obs.cpu().numpy()\n", + " \n", + " def _get_price(self, idx):\n", + " \"\"\"Fast GPU price lookup\"\"\"\n", + " return self.gpu_cache.close_prices[idx].item()\n", + " \n", + " def step(self, action):\n", + " idx = self.start_idx + self.current_step\n", + " current_price = self._get_price(idx)\n", + " target_position = np.clip(action[0], -1.0, 1.0)\n", + " \n", + " self.prev_total_value = self.total_value\n", + " self.trade_cost = 0.0\n", + " \n", + " position_change = abs(target_position - self.position)\n", + " if position_change > 0.1:\n", + " self.trade_count += 1 # Count this trade\n", + " if self.position != 0:\n", + " self._close_position(current_price)\n", + " if abs(target_position) > 0.1:\n", + " self._open_position(target_position, current_price)\n", + " trade_value = abs(target_position) * self.initial_balance\n", + " 
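# Cost is charged on traded notional (fee + slippage); note the close-out\n", + " # fee in _close_position also hits the balance, so fees appear both in\n", + " # V_t and in this explicit cost term.\n", + " 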
self.trade_cost = trade_value * (self.transaction_fee + self.slippage)\n", + " \n", + " self._update_total_value(current_price)\n", + " self.max_value = max(self.max_value, self.total_value)\n", + " \n", + " self.current_step += 1\n", + " done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)\n", + " \n", + " # PORTFOLIO LOG RETURN REWARD\n", + " if self.prev_total_value > 0 and self.total_value > 0:\n", + " log_return = np.log(self.total_value / self.prev_total_value)\n", + " else:\n", + " log_return = 0.0\n", + " \n", + " cost_normalized = self.trade_cost / self.initial_balance\n", + " reward = log_return - cost_normalized\n", + " \n", + " # INVERTED: Penalize INACTIVITY (position=0), NOT holding!\n", + " if abs(self.position) < 0.1: # Flat/no position = inactive\n", + " reward -= self.inactivity_penalty\n", + " \n", + " obs = self._get_obs()\n", + " info = {\n", + " 'total_value': self.total_value,\n", + " 'position': self.position,\n", + " 'reward_type': 'log_return',\n", + " 'log_return': log_return,\n", + " 'trade_cost': cost_normalized,\n", + " 'trade_count': self.trade_count,\n", + " 'inactivity_penalty': self.inactivity_penalty if abs(self.position) < 0.1 else 0\n", + " }\n", + " \n", + " return obs, reward, done, info\n", + " \n", + " def _update_total_value(self, current_price):\n", + " if self.position != 0:\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)\n", + " self.total_value = self.balance + pnl\n", + " else:\n", + " self.total_value = self.balance\n", + " \n", + " def _open_position(self, size, price):\n", + " self.position = size\n", + " self.entry_price = price\n", + " \n", + " def _close_position(self, price):\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)\n", + " pnl -= abs(pnl) * self.transaction_fee\n", + " self.balance += pnl\n", + " self.position = 0.0\n", + "\n", + "\n", + "# ============================================================================\n", + "# ENVIRONMENT 3: VERSION 9 STYLE - SIMPLE PNL WITH CHURNING PENALTY (GPU OPTIMIZED)\n", + "# ============================================================================\n", + "\n", + "class V9StyleEnv(gym.Env):\n", + " \"\"\"\n", + " Environment matching Version 9's reward shaping - GPU OPTIMIZED\n", + " \n", + " REWARD FORMULA (from Version 9):\n", + " reward = (V_t - V_{t-1}) / initial_balance\n", + " \n", + " # Small penalty ONLY for excessive position changes (>0.5)\n", + " if abs(target_position - self.position) > 0.5:\n", + " reward -= 0.0001 # Tiny churning penalty\n", + " \n", + " KEY DIFFERENCES FROM OTHER ENVS:\n", + " - NO inactivity penalty (doesn't force trading)\n", + " - NO holding penalty\n", + " - Only penalizes EXCESSIVE trading (>0.5 position change)\n", + " - Transaction fee only on position close (0.1% default)\n", + " \n", + " Used by: Agent that matches Version 9 behavior\n", + " \"\"\"\n", + " \n", + " def __init__(self, df, initial_balance=10000, base_episode_length=500, \n", + " base_transaction_fee=0.001, churning_penalty=0.0001,\n", + " domain_randomization=True, device='cuda:0'):\n", + " super().__init__()\n", + " self.df = df.reset_index(drop=True)\n", + " self.initial_balance = 
initial_balance\n", + " self.base_episode_length = base_episode_length\n", + " self.base_transaction_fee = base_transaction_fee\n", + " self.churning_penalty = churning_penalty # Penalty for excessive trading\n", + " self.domain_randomization = domain_randomization\n", + " \n", + " # TIER 3: GPU tensor cache\n", + " self.device = torch.device(device) if torch.cuda.is_available() else torch.device('cpu')\n", + " self.gpu_cache = GPUTensorCache(df, self.device)\n", + " \n", + " self.feature_cols = self.gpu_cache.feature_cols\n", + " \n", + " self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)\n", + " self.observation_space = spaces.Box(\n", + " low=-10, high=10, \n", + " shape=(len(self.feature_cols) + 5,), \n", + " dtype=np.float32\n", + " )\n", + " self.reset()\n", + " \n", + " def reset(self):\n", + " if self.domain_randomization:\n", + " self.episode_length = np.random.randint(\n", + " int(self.base_episode_length * 0.9),\n", + " int(self.base_episode_length * 1.1)\n", + " )\n", + " # Only randomize fee if base fee > 0\n", + " if self.base_transaction_fee > 0:\n", + " self.transaction_fee = np.random.uniform(0.0007, 0.0012)\n", + " else:\n", + " self.transaction_fee = 0.0\n", + " else:\n", + " self.episode_length = self.base_episode_length\n", + " self.transaction_fee = self.base_transaction_fee\n", + " \n", + " max_start = len(self.df) - self.episode_length - 100\n", + " self.start_idx = np.random.randint(100, max_start)\n", + " \n", + " self.current_step = 0\n", + " self.balance = self.initial_balance\n", + " self.position = 0.0\n", + " self.entry_price = 0.0\n", + " self.total_value = self.initial_balance\n", + " self.prev_total_value = self.initial_balance\n", + " self.max_value = self.initial_balance\n", + " self.trade_count = 0\n", + " \n", + " return self._get_obs()\n", + " \n", + " def _get_obs(self):\n", + " \"\"\"TIER 3: Get observation using GPU tensors\"\"\"\n", + " idx = self.start_idx + self.current_step\n", + " \n", + " features = self.gpu_cache.features_gpu[idx]\n", + " \n", + " total_return = (self.total_value / self.initial_balance) - 1\n", + " drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0\n", + " \n", + " portfolio_info = torch.tensor([\n", + " self.position,\n", + " total_return,\n", + " drawdown,\n", + " self.gpu_cache.returns_1[idx].item(),\n", + " self.gpu_cache.rsi_14[idx].item()\n", + " ], device=self.device, dtype=torch.float32)\n", + " \n", + " obs = torch.cat([features, portfolio_info])\n", + " obs = torch.clamp(obs, -10, 10)\n", + " \n", + " return obs.cpu().numpy()\n", + " \n", + " def _get_price(self, idx):\n", + " return self.gpu_cache.close_prices[idx].item()\n", + " \n", + " def step(self, action):\n", + " idx = self.start_idx + self.current_step\n", + " current_price = self._get_price(idx)\n", + " target_position = np.clip(action[0], -1.0, 1.0)\n", + " \n", + " self.prev_total_value = self.total_value\n", + " position_change = abs(target_position - self.position)\n", + " \n", + " # Execute trade if significant position change\n", + " if position_change > 0.1:\n", + " self.trade_count += 1\n", + " if self.position != 0:\n", + " self._close_position(current_price)\n", + " if abs(target_position) > 0.1:\n", + " self._open_position(target_position, current_price)\n", + " \n", + " self._update_total_value(current_price)\n", + " self.max_value = max(self.max_value, self.total_value)\n", + " \n", + " self.current_step += 1\n", + " done = (self.current_step >= self.episode_length) or 
(self.total_value <= self.initial_balance * 0.5)\n", + " \n", + " # VERSION 9 STYLE REWARD: Simple PnL normalized\n", + " reward = (self.total_value - self.prev_total_value) / self.initial_balance\n", + " \n", + " # Small penalty ONLY for EXCESSIVE position changes (>0.5)\n", + " # This discourages churning but doesn't penalize normal trading\n", + " if position_change > 0.5:\n", + " reward -= self.churning_penalty\n", + " \n", + " obs = self._get_obs()\n", + " info = {\n", + " 'total_value': self.total_value,\n", + " 'position': self.position,\n", + " 'reward_type': 'v9_style',\n", + " 'trade_count': self.trade_count,\n", + " 'churning_penalty': self.churning_penalty if position_change > 0.5 else 0\n", + " }\n", + " \n", + " return obs, reward, done, info\n", + " \n", + " def _update_total_value(self, current_price):\n", + " if self.position != 0:\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)\n", + " self.total_value = self.balance + pnl\n", + " else:\n", + " self.total_value = self.balance\n", + " \n", + " def _open_position(self, size, price):\n", + " self.position = size\n", + " self.entry_price = price\n", + " \n", + " def _close_position(self, price):\n", + " if self.position > 0:\n", + " pnl = self.position * self.initial_balance * (price / self.entry_price - 1)\n", + " else:\n", + " pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)\n", + " pnl -= abs(pnl) * self.transaction_fee\n", + " self.balance += pnl\n", + " self.position = 0.0\n", + "\n", + "\n", + "# ============================================================================\n", + "# TIER 4: VECTORIZED ENVIRONMENT WRAPPER\n", + "# ============================================================================\n", + "\n", + "class VectorizedEnvWrapper:\n", + " \"\"\"\n", + " TIER 4: Vectorized environment for batched rollouts.\n", + " Runs multiple environment instances in parallel for increased throughput.\n", + " \"\"\"\n", + " \n", + " def __init__(self, env_class, df, num_envs=8, device='cuda:0', **env_kwargs):\n", + " self.num_envs = num_envs\n", + " self.device = device\n", + " self.envs = [env_class(df, device=device, **env_kwargs) for _ in range(num_envs)]\n", + " \n", + " # Get observation and action space from first env\n", + " self.observation_space = self.envs[0].observation_space\n", + " self.action_space = self.envs[0].action_space\n", + " self.state_dim = self.observation_space.shape[0]\n", + " \n", + " def reset(self):\n", + " \"\"\"Reset all environments, return batched observations\"\"\"\n", + " observations = np.array([env.reset() for env in self.envs])\n", + " return observations # Shape: (num_envs, state_dim)\n", + " \n", + " def step(self, actions):\n", + " \"\"\"\n", + " Step all environments with batched actions.\n", + " Args:\n", + " actions: np.array of shape (num_envs, action_dim)\n", + " Returns:\n", + " observations: (num_envs, state_dim)\n", + " rewards: (num_envs,)\n", + " dones: (num_envs,)\n", + " infos: list of dicts\n", + " \"\"\"\n", + " results = [env.step(actions[i]) for i, env in enumerate(self.envs)]\n", + " \n", + " observations = np.array([r[0] for r in results])\n", + " rewards = np.array([r[1] for r in results])\n", + " dones = np.array([r[2] for r in results])\n", + " infos = [r[3] for r in results]\n", + " \n", + " # Auto-reset done environments\n", + " for i, done in 
enumerate(dones):\n", + " if done:\n", + " observations[i] = self.envs[i].reset()\n", + " \n", + " return observations, rewards, dones, infos\n", + " \n", + " def close(self):\n", + " pass\n", + "\n", + "\n", + "print(\"✅ GPU-Optimized environment classes created:\")\n", + "print(\" 1. SimpleReturnEnv: (V_t - V_{t-1}) / balance - INACTIVITY_penalty\")\n", + "print(\" 2. LogReturnEnv: log(V_t / V_{t-1}) - costs - INACTIVITY_penalty\")\n", + "print(\" 3. V9StyleEnv: (V_t - V_{t-1}) / balance - tiny churning penalty (Version 9 style)\")\n", + "print(\"\\n📊 Reward Shaping Summary:\")\n", + "print(\" SimpleReturnEnv/LogReturnEnv: Penalizes INACTIVITY (position=0)\")\n", + "print(\" V9StyleEnv: Only penalizes EXCESSIVE trading (position change > 0.5)\")\n", + "print(\"\\n🚀 TIER 3 Optimizations:\")\n", + "print(\" ✓ GPUTensorCache: DataFrame cached on GPU\")\n", + "print(\" ✓ Fast tensor indexing for observations\")\n", + "print(\" ✓ GPU price lookups\")\n", + "print(\"\\n🚀 TIER 4 Optimizations:\")\n", + "print(\" ✓ VectorizedEnvWrapper: Batched rollouts\")\n", + "print(\" ✓ Parallel environment stepping\")\n", + "print(\"\\n📊 Agent Assignment:\")\n", + "print(\" Agent 1 (GPU 0): V9StyleEnv (Version 9 reward)\")\n", + "print(\" Agent 2 (GPU 0): SimpleReturnEnv\")\n", + "print(\" Agent 3 (GPU 1): LogReturnEnv\")\n", + "print(\" Agent 4 (GPU 1): LogReturnEnv\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 0.12: MULTI-AGENT CONFIGURATION (4 AGENTS - SEPARATE PARAMETERS)\n", + "# ============================================================================\n", + "\n", + "print(\"=\"*70)\n", + "print(\" MULTI-AGENT CONFIGURATION - 4 AGENTS WITH INDIVIDUAL PARAMETERS\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# AGENT CONFIGURATIONS (EASILY MODIFIABLE)\n", + "# ============================================================================\n", + "\n", + "AGENT_CONFIGS = {\n", + " # ========================================================================\n", + " # AGENT 1: VERSION 9 STYLE REWARD (GPU 0) - YOUR SPECIFIED HYPERPARAMETERS\n", + " # ========================================================================\n", + " 'agent_1': {\n", + " 'name': 'Agent1_V9Style',\n", + " 'gpu_id': 0,\n", + " 'reward_type': 'v9_style', # Uses V9StyleEnv (Version 9 reward shaping)\n", + " 'env_params': {\n", + " 'initial_balance': 10000,\n", + " 'base_episode_length': 500,\n", + " 'base_transaction_fee': 0.0, # ← NO TRANSACTION FEE\n", + " 'churning_penalty': 0.0001, # ← Tiny penalty for excessive trading only\n", + " 'domain_randomization': True,\n", + " },\n", + " 'agent_params': {\n", + " 'actor_lr': 3e-3, # ← Your specified: high LR\n", + " 'critic_lr': 3e-3, # ← Your specified: high LR\n", + " 'alpha_lr': 2e-4, # ← Your specified\n", + " 'gamma': 0.95, # ← Your specified\n", + " 'tau': 0.005, # ← Your specified\n", + " 'batch_size': 4096, # ← Your specified: large batch\n", + " 'initial_alpha': 0.3, # ← Your specified\n", + " 'l2_lambda': 7.5e-5, # ← Your specified\n", + " 'dropout_rate': 0.15, # ← Your specified\n", + " 'gradient_clip_norm': 1.0, # ← Your specified\n", + " 'gradient_steps': 1, # ← Your specified\n", + " 'min_alpha': 0.009, # ← Your specified\n", + " 'target_entropy': -0.2, # ← Your specified (multiplier)\n", + " },\n", + " 
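# NOTE: MultiAgentSAC uses target_entropy as-is (no multiplier is applied;\n", + " # see the alpha loss in Cell 1.1), despite the note above.\n", + " 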
'training_params': {\n", + " 'num_episodes': 1500,\n", + " 'eval_frequency': 10,\n", + " 'eval_episodes': 3,\n", + " 'warmup_steps': 5000,\n", + " 'seed': 42,\n", + " },\n", + " },\n", + " \n", + " # ========================================================================\n", + " # AGENT 2: Simple Return Reward (GPU 0) - Different hyperparameters\n", + " # ========================================================================\n", + " 'agent_2': {\n", + " 'name': 'Agent2_SimpleReturn',\n", + " 'gpu_id': 0,\n", + " 'reward_type': 'simple_return',\n", + " 'env_params': {\n", + " 'initial_balance': 10000,\n", + " 'base_episode_length': 500,\n", + " 'base_transaction_fee': 0.0, # ← NO TRANSACTION FEE\n", + " 'inactivity_penalty': 0.0008, # ← Higher penalty for not trading\n", + " 'domain_randomization': True,\n", + " },\n", + " 'agent_params': {\n", + " 'actor_lr': 1e-4, # ← Lower learning rate\n", + " 'critic_lr': 1e-4,\n", + " 'alpha_lr': 5e-5,\n", + " 'gamma': 0.98, # ← Higher discount factor\n", + " 'tau': 0.003, # ← Slower target updates\n", + " 'batch_size': 512, # ← Good GPU utilization + speed\n", + " 'initial_alpha': 0.2,\n", + " 'l2_lambda': 1e-4,\n", + " 'dropout_rate': 0.15, # ← More dropout\n", + " 'gradient_clip_norm': 0.5,\n", + " 'gradient_steps': 2, # ← Balanced\n", + " 'min_alpha': 0.01,\n", + " 'target_entropy': -0.5, # ← Different entropy target\n", + " },\n", + " 'training_params': {\n", + " 'num_episodes': 1500,\n", + " 'eval_frequency': 10,\n", + " 'eval_episodes': 3,\n", + " 'warmup_steps': 5000,\n", + " 'seed': 123, # Different seed\n", + " },\n", + " },\n", + " \n", + " # ========================================================================\n", + " # AGENT 3: Log Return Reward (GPU 1)\n", + " # ========================================================================\n", + " 'agent_3': {\n", + " 'name': 'Agent3_LogReturn',\n", + " 'gpu_id': 1,\n", + " 'reward_type': 'log_return', # Uses LogReturnEnv\n", + " 'env_params': {\n", + " 'initial_balance': 10000,\n", + " 'base_episode_length': 500,\n", + " 'base_transaction_fee': 0.0, # ← NO TRANSACTION FEE\n", + " 'slippage': 0.0, # ← NO SLIPPAGE\n", + " 'inactivity_penalty': 0.0005, # ← Penalize NOT trading\n", + " 'domain_randomization': True,\n", + " },\n", + " 'agent_params': {\n", + " 'actor_lr': 3e-4,\n", + " 'critic_lr': 3e-4,\n", + " 'alpha_lr': 1e-4,\n", + " 'gamma': 0.97,\n", + " 'tau': 0.005,\n", + " 'batch_size': 512, # ← Good GPU utilization + speed\n", + " 'initial_alpha': 0.3,\n", + " 'l2_lambda': 1e-4,\n", + " 'dropout_rate': 0.10,\n", + " 'gradient_clip_norm': 1.0,\n", + " 'gradient_steps': 2, # ← Balanced\n", + " 'min_alpha': 0.01,\n", + " 'target_entropy': -0.3,\n", + " },\n", + " 'training_params': {\n", + " 'num_episodes': 1500,\n", + " 'eval_frequency': 10,\n", + " 'eval_episodes': 3,\n", + " 'warmup_steps': 5000,\n", + " 'seed': 456,\n", + " },\n", + " },\n", + " \n", + " # ========================================================================\n", + " # AGENT 4: Log Return Reward (GPU 1) - Different hyperparameters\n", + " # ========================================================================\n", + " 'agent_4': {\n", + " 'name': 'Agent4_LogReturn',\n", + " 'gpu_id': 1,\n", + " 'reward_type': 'log_return',\n", + " 'env_params': {\n", + " 'initial_balance': 10000,\n", + " 'base_episode_length': 600, # ← Longer episodes\n", + " 'base_transaction_fee': 0.0, # ← NO TRANSACTION FEE\n", + " 'slippage': 0.0, # ← NO SLIPPAGE\n", + " 'inactivity_penalty': 0.0003, # ← Moderate penalty\n", + " 
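# Longer 600-step episodes and a lighter inactivity penalty than Agent 3\n", + " # differentiate this run.\n", + " 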
'domain_randomization': True,\n", + " },\n", + " 'agent_params': {\n", + " 'actor_lr': 5e-4, # ← Higher learning rate\n", + " 'critic_lr': 5e-4,\n", + " 'alpha_lr': 2e-4,\n", + " 'gamma': 0.95, # ← Lower discount\n", + " 'tau': 0.01, # ← Faster target updates\n", + " 'batch_size': 512, # ← Good GPU utilization + speed\n", + " 'initial_alpha': 0.5, # ← Higher initial exploration\n", + " 'l2_lambda': 5e-5,\n", + " 'dropout_rate': 0.05, # ← Less dropout\n", + " 'gradient_clip_norm': 2.0,\n", + " 'gradient_steps': 2, # ← Balanced\n", + " 'min_alpha': 0.02,\n", + " 'target_entropy': -0.2,\n", + " },\n", + " 'training_params': {\n", + " 'num_episodes': 1500,\n", + " 'eval_frequency': 10,\n", + " 'eval_episodes': 3,\n", + " 'warmup_steps': 5000,\n", + " 'seed': 789,\n", + " },\n", + " },\n", + "}\n", + "\n", + "# ============================================================================\n", + "# PRINT CONFIGURATION SUMMARY\n", + "# ============================================================================\n", + "print(\"\\n📋 AGENT CONFIGURATION SUMMARY:\")\n", + "print(\"-\"*70)\n", + "\n", + "for agent_id, config in AGENT_CONFIGS.items():\n", + " print(f\"\\n🤖 {config['name']}:\")\n", + " print(f\" GPU: {config['gpu_id']}\")\n", + " print(f\" Reward: {config['reward_type']}\")\n", + " # V9StyleEnv uses churning_penalty, others use inactivity_penalty\n", + " if config['reward_type'] == 'v9_style':\n", + " print(f\" Churning Penalty: {config['env_params'].get('churning_penalty', 0.0001)}\")\n", + " else:\n", + " print(f\" Inactivity Penalty: {config['env_params'].get('inactivity_penalty', 0.0005)}\")\n", + " print(f\" Transaction Fee: {config['env_params']['base_transaction_fee']}\")\n", + " print(f\" Key params: γ={config['agent_params']['gamma']}, \"\n", + " f\"α_init={config['agent_params']['initial_alpha']}, \"\n", + " f\"batch={config['agent_params']['batch_size']}\")\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\" ✅ All 4 agent configurations ready!\")\n", + "print(\" REWARD TYPES:\")\n", + "print(\" • v9_style: Penalizes EXCESSIVE trading (churning > 0.5 position change)\")\n", + "print(\" • simple_return/log_return: Penalizes INACTIVITY (not being in market)\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 1.1: MULTI-AGENT SAC CLASS - MAXIMUM GPU UTILIZATION\n", + "# All tensor operations on GPU, minimal CPU↔GPU transfers\n", + "# ============================================================================\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import torch.optim as optim\n", + "from torch.distributions import Normal\n", + "import numpy as np\n", + "\n", + "print(\"=\"*70)\n", + "print(\" MULTI-AGENT SAC CLASS - MAXIMUM GPU UTILIZATION\")\n", + "print(\"=\"*70)\n", + "\n", + "class MultiAgentActor(nn.Module):\n", + " \"\"\"Actor network with Dropout - GPU assignable\"\"\"\n", + " \n", + " def __init__(self, state_dim, action_dim, l2_lambda=1e-5, dropout_rate=0.10):\n", + " super(MultiAgentActor, self).__init__()\n", + " \n", + " self.fc1 = nn.Linear(state_dim, 512)\n", + " self.fc2 = nn.Linear(512, 384)\n", + " self.fc3 = nn.Linear(384, 256)\n", + " \n", + " self.mean_out = nn.Linear(256, action_dim)\n", + " self.log_std_out = nn.Linear(256, action_dim)\n", + " \n", + " self.dropout = nn.Dropout(dropout_rate)\n", + " 
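# One shared Dropout module is reused after each hidden layer in forward(),\n", + " # regularizing the 512-384-256 trunk.\n", + " 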
self._init_weights()\n", + " \n", + " def _init_weights(self):\n", + " for layer in [self.fc1, self.fc2, self.fc3]:\n", + " nn.init.xavier_uniform_(layer.weight)\n", + " nn.init.zeros_(layer.bias)\n", + " nn.init.xavier_uniform_(self.mean_out.weight)\n", + " nn.init.zeros_(self.mean_out.bias)\n", + " nn.init.zeros_(self.log_std_out.weight)\n", + " nn.init.zeros_(self.log_std_out.bias)\n", + " \n", + " def forward(self, state):\n", + " x = F.relu(self.fc1(state))\n", + " x = self.dropout(x)\n", + " x = F.relu(self.fc2(x))\n", + " x = self.dropout(x)\n", + " x = F.relu(self.fc3(x))\n", + " x = self.dropout(x)\n", + " \n", + " mean = self.mean_out(x)\n", + " log_std = self.log_std_out(x)\n", + " log_std = torch.clamp(log_std, -20, 2)\n", + " \n", + " return mean, log_std\n", + " \n", + " def sample(self, state):\n", + " mean, log_std = self.forward(state)\n", + " std = log_std.exp()\n", + " \n", + " normal = Normal(mean, std)\n", + " x_t = normal.rsample()\n", + " action = torch.tanh(x_t)\n", + " \n", + " log_prob = normal.log_prob(x_t)\n", + " log_prob -= torch.log(1 - action.pow(2) + 1e-6)\n", + " log_prob = log_prob.sum(dim=-1, keepdim=True)\n", + " \n", + " return action, log_prob, mean\n", + "\n", + "\n", + "class MultiAgentCritic(nn.Module):\n", + " \"\"\"Critic network with Dropout - GPU assignable\"\"\"\n", + " \n", + " def __init__(self, state_dim, action_dim, l2_lambda=1e-5, dropout_rate=0.10):\n", + " super(MultiAgentCritic, self).__init__()\n", + " \n", + " self.fc1 = nn.Linear(state_dim + action_dim, 512)\n", + " self.fc2 = nn.Linear(512, 384)\n", + " self.fc3 = nn.Linear(384, 256)\n", + " self.q_out = nn.Linear(256, 1)\n", + " \n", + " self.dropout = nn.Dropout(dropout_rate)\n", + " self._init_weights()\n", + " \n", + " def _init_weights(self):\n", + " for layer in [self.fc1, self.fc2, self.fc3, self.q_out]:\n", + " nn.init.xavier_uniform_(layer.weight)\n", + " nn.init.zeros_(layer.bias)\n", + " \n", + " def forward(self, state, action):\n", + " x = torch.cat([state, action], dim=-1)\n", + " x = F.relu(self.fc1(x))\n", + " x = self.dropout(x)\n", + " x = F.relu(self.fc2(x))\n", + " x = self.dropout(x)\n", + " x = F.relu(self.fc3(x))\n", + " x = self.dropout(x)\n", + " q_value = self.q_out(x)\n", + " return q_value\n", + "\n", + "\n", + "class MultiAgentSAC:\n", + " \"\"\"\n", + " Multi-Agent SAC - MAXIMUM GPU UTILIZATION\n", + " \n", + " Key optimizations for GPU usage:\n", + " 1. All tensor operations stay on GPU\n", + " 2. Minimal CPU↔GPU data transfers\n", + " 3. GPU-native random number generation\n", + " 4. Fused optimizer steps\n", + " 5. 
Persistent GPU tensors for common operations\n", + " \"\"\"\n", + " \n", + " def __init__(\n", + " self,\n", + " agent_name,\n", + " state_dim,\n", + " action_dim=1,\n", + " gpu_id=0,\n", + " actor_lr=3e-4,\n", + " critic_lr=3e-4,\n", + " alpha_lr=3e-4,\n", + " gamma=0.99,\n", + " tau=0.005,\n", + " batch_size=256,\n", + " initial_alpha=0.3,\n", + " l2_lambda=1e-5,\n", + " dropout_rate=0.10,\n", + " gradient_clip_norm=1.0,\n", + " gradient_steps=1,\n", + " min_alpha=0.01,\n", + " target_entropy=-0.3,\n", + " use_compile=False\n", + " ):\n", + " self.agent_name = agent_name\n", + " self.state_dim = state_dim\n", + " self.action_dim = action_dim\n", + " self.gamma = gamma\n", + " self.tau = tau\n", + " self.batch_size = batch_size\n", + " self.gradient_steps = gradient_steps\n", + " self.l2_lambda = l2_lambda\n", + " self.dropout_rate = dropout_rate\n", + " self.gradient_clip_norm = gradient_clip_norm\n", + " self.min_alpha = min_alpha\n", + " self.target_entropy = target_entropy\n", + " \n", + " # Set device based on GPU ID\n", + " if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():\n", + " self.device = torch.device(f\"cuda:{gpu_id}\")\n", + " torch.cuda.set_device(self.device)\n", + " else:\n", + " self.device = torch.device(\"cpu\")\n", + " \n", + " self.gpu_id = gpu_id\n", + " \n", + " # Build networks on specified GPU\n", + " self.actor = MultiAgentActor(state_dim, action_dim, l2_lambda, dropout_rate).to(self.device)\n", + " self.critic_1 = MultiAgentCritic(state_dim, action_dim, l2_lambda, dropout_rate).to(self.device)\n", + " self.critic_2 = MultiAgentCritic(state_dim, action_dim, l2_lambda, dropout_rate).to(self.device)\n", + " self.target_critic_1 = MultiAgentCritic(state_dim, action_dim, l2_lambda, dropout_rate).to(self.device)\n", + " self.target_critic_2 = MultiAgentCritic(state_dim, action_dim, l2_lambda, dropout_rate).to(self.device)\n", + " \n", + " # Copy weights to targets\n", + " self.target_critic_1.load_state_dict(self.critic_1.state_dict())\n", + " self.target_critic_2.load_state_dict(self.critic_2.state_dict())\n", + " \n", + " # Optimizers with fused=True for GPU optimization (PyTorch 2.0+)\n", + " fused_available = 'fused' in torch.optim.Adam.__init__.__code__.co_varnames\n", + " opt_kwargs = {'fused': True} if fused_available and self.device.type == 'cuda' else {}\n", + " \n", + " self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=l2_lambda, **opt_kwargs)\n", + " self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr, weight_decay=l2_lambda, **opt_kwargs)\n", + " self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr, weight_decay=l2_lambda, **opt_kwargs)\n", + " \n", + " # Entropy tuning\n", + " self.log_alpha = torch.tensor(np.log(initial_alpha), dtype=torch.float32, \n", + " requires_grad=True, device=self.device)\n", + " self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)\n", + " \n", + " # Pre-allocate GPU tensors for target entropy (avoid repeated allocation)\n", + " self.target_entropy_tensor = torch.tensor(target_entropy, device=self.device, dtype=torch.float32)\n", + " self.gamma_tensor = torch.tensor(gamma, device=self.device, dtype=torch.float32)\n", + " self.tau_tensor = torch.tensor(tau, device=self.device, dtype=torch.float32)\n", + " self.one_minus_tau = torch.tensor(1.0 - tau, device=self.device, dtype=torch.float32)\n", + " \n", + " # CUDA stream for overlapping operations\n", + " if self.device.type == 'cuda':\n", + " self.compute_stream = 
torch.cuda.Stream(device=self.device)\n", + " else:\n", + " self.compute_stream = None\n", + " \n", + " print(f\"✅ {agent_name} created on GPU {gpu_id} (MAX GPU UTILIZATION)\")\n", + " print(f\" Device: {self.device}\")\n", + " print(f\" Actor params: {self._count_params(self.actor):,}\")\n", + " if self.device.type == 'cuda':\n", + " print(f\" ✓ Fused Adam optimizer: {fused_available}\")\n", + " print(f\" ✓ Pre-allocated GPU tensors\")\n", + " print(f\" ✓ GPU-native operations\")\n", + " \n", + " @property\n", + " def alpha(self):\n", + " raw_alpha = self.log_alpha.exp().item()\n", + " return max(raw_alpha, self.min_alpha)\n", + " \n", + " @torch.no_grad()\n", + " def get_action_gpu(self, state_tensor, deterministic=False):\n", + " \"\"\"\n", + " GPU-NATIVE action selection - state already on GPU\n", + " Returns GPU tensor, no CPU transfer\n", + " \"\"\"\n", + " self.actor.eval()\n", + " if deterministic:\n", + " mean, _ = self.actor(state_tensor)\n", + " action = torch.tanh(mean)\n", + " else:\n", + " action, _, _ = self.actor.sample(state_tensor)\n", + " self.actor.train()\n", + " return action\n", + " \n", + " def get_action(self, state, deterministic=False):\n", + " \"\"\"Get action for single state (numpy input for compatibility)\"\"\"\n", + " state = torch.FloatTensor(state).unsqueeze(0).to(self.device, non_blocking=True)\n", + " action = self.get_action_gpu(state, deterministic)\n", + " return action.cpu().numpy()[0], None\n", + " \n", + " @torch.no_grad()\n", + " def get_action_batch_gpu(self, states_tensor, deterministic=False):\n", + " \"\"\"\n", + " GPU-NATIVE batch action selection\n", + " Input: GPU tensor, Output: GPU tensor\n", + " \"\"\"\n", + " self.actor.eval()\n", + " if deterministic:\n", + " mean, _ = self.actor(states_tensor)\n", + " actions = torch.tanh(mean)\n", + " else:\n", + " actions, _, _ = self.actor.sample(states_tensor)\n", + " self.actor.train()\n", + " return actions\n", + " \n", + " def get_action_batch(self, states, deterministic=False):\n", + " \"\"\"Get actions for batch (numpy input for compatibility)\"\"\"\n", + " states_t = torch.FloatTensor(states).to(self.device, non_blocking=True)\n", + " actions = self.get_action_batch_gpu(states_t, deterministic)\n", + " return actions.cpu().numpy()\n", + " \n", + " def train_step(self, states, actions, rewards, next_states, dones, weights):\n", + " \"\"\"\n", + " Single SAC training step - ALL ON GPU\n", + " No CPU transfers except final TD-errors for priority update\n", + " \"\"\"\n", + " # All inputs should already be on GPU from AsyncBatchSampler\n", + " \n", + " # Critic update - compute targets (no grad needed)\n", + " with torch.no_grad():\n", + " next_actions, next_log_prob, _ = self.actor.sample(next_states)\n", + " q1_target = self.target_critic_1(next_states, next_actions)\n", + " q2_target = self.target_critic_2(next_states, next_actions)\n", + " q_target = torch.min(q1_target, q2_target) - self.log_alpha.exp() * next_log_prob\n", + " target_q = rewards + self.gamma_tensor * (1 - dones) * q_target\n", + " \n", + " # Critic 1 loss\n", + " q1 = self.critic_1(states, actions)\n", + " td_errors = torch.abs(q1 - target_q).detach()\n", + " critic1_loss = (weights * F.mse_loss(q1, target_q, reduction='none')).mean()\n", + " \n", + " self.critic_1_optimizer.zero_grad(set_to_none=True) # Faster than zero_grad()\n", + " critic1_loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(self.critic_1.parameters(), self.gradient_clip_norm)\n", + " self.critic_1_optimizer.step()\n", + " \n", + " # Critic 2 loss\n", 
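+ " # Both critics regress to the same min-based target computed above\n", + " # (clipped double-Q), which counteracts Q-value overestimation.\n", 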
+ " q2 = self.critic_2(states, actions)\n", + " critic2_loss = (weights * F.mse_loss(q2, target_q, reduction='none')).mean()\n", + " \n", + " self.critic_2_optimizer.zero_grad(set_to_none=True)\n", + " critic2_loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(self.critic_2.parameters(), self.gradient_clip_norm)\n", + " self.critic_2_optimizer.step()\n", + " \n", + " # Actor loss\n", + " new_actions, log_prob, _ = self.actor.sample(states)\n", + " q1_new = self.critic_1(states, new_actions)\n", + " q2_new = self.critic_2(states, new_actions)\n", + " q_new = torch.min(q1_new, q2_new)\n", + " actor_loss = (self.log_alpha.exp() * log_prob - q_new).mean()\n", + " \n", + " self.actor_optimizer.zero_grad(set_to_none=True)\n", + " actor_loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.gradient_clip_norm)\n", + " self.actor_optimizer.step()\n", + " \n", + " # Alpha loss\n", + " alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy_tensor).detach()).mean()\n", + " \n", + " self.alpha_optimizer.zero_grad(set_to_none=True)\n", + " alpha_loss.backward()\n", + " self.alpha_optimizer.step()\n", + " \n", + " # Update targets using pre-allocated tensors\n", + " self._soft_update_targets()\n", + " \n", + " return td_errors, critic1_loss.item(), critic2_loss.item(), actor_loss.item(), alpha_loss.item()\n", + " \n", + " @torch.no_grad()\n", + " def _soft_update_targets(self):\n", + " \"\"\"Soft update target networks - vectorized on GPU\"\"\"\n", + " for target_param, param in zip(self.target_critic_1.parameters(), self.critic_1.parameters()):\n", + " target_param.data.mul_(self.one_minus_tau).add_(param.data, alpha=self.tau)\n", + " \n", + " for target_param, param in zip(self.target_critic_2.parameters(), self.critic_2.parameters()):\n", + " target_param.data.mul_(self.one_minus_tau).add_(param.data, alpha=self.tau)\n", + " \n", + " def update_targets(self):\n", + " \"\"\"Alias for compatibility\"\"\"\n", + " self._soft_update_targets()\n", + " \n", + " def _count_params(self, model):\n", + " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", + " \n", + " def save_weights(self, prefix):\n", + " torch.save(self.actor.state_dict(), f\"{prefix}_actor.pt\")\n", + " torch.save(self.critic_1.state_dict(), f\"{prefix}_critic1.pt\")\n", + " torch.save(self.critic_2.state_dict(), f\"{prefix}_critic2.pt\")\n", + " torch.save(self.target_critic_1.state_dict(), f\"{prefix}_target_critic1.pt\")\n", + " torch.save(self.target_critic_2.state_dict(), f\"{prefix}_target_critic2.pt\")\n", + " torch.save(self.log_alpha, f\"{prefix}_log_alpha.pt\")\n", + " print(f\"💾 {self.agent_name} weights saved to {prefix}_*.pt\")\n", + " \n", + " def load_weights(self, prefix):\n", + " self.actor.load_state_dict(torch.load(f\"{prefix}_actor.pt\", map_location=self.device))\n", + " self.critic_1.load_state_dict(torch.load(f\"{prefix}_critic1.pt\", map_location=self.device))\n", + " self.critic_2.load_state_dict(torch.load(f\"{prefix}_critic2.pt\", map_location=self.device))\n", + " self.target_critic_1.load_state_dict(torch.load(f\"{prefix}_target_critic1.pt\", map_location=self.device))\n", + " self.target_critic_2.load_state_dict(torch.load(f\"{prefix}_target_critic2.pt\", map_location=self.device))\n", + " self.log_alpha = torch.load(f\"{prefix}_log_alpha.pt\", map_location=self.device)\n", + " print(f\"📂 {self.agent_name} weights loaded from {prefix}_*.pt\")\n", + "\n", + "\n", + "print(\"\\n✅ MultiAgentSAC class ready! 
(MAX GPU UTILIZATION)\")\n", + "print(\"\\n🚀 GPU Optimizations:\")\n", + "print(\" ✓ Fused Adam optimizer (fewer GPU kernel launches)\")\n", + "print(\" ✓ Pre-allocated GPU tensors (gamma, tau, entropy)\")\n", + "print(\" ✓ zero_grad(set_to_none=True) (faster memory ops)\")\n", + "print(\" ✓ GPU-native action selection methods\")\n", + "print(\" ✓ Vectorized soft updates with in-place ops\")\n", + "print(\" ✓ Non-blocking CPU→GPU transfers\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 2: PRIORITIZED EXPERIENCE REPLAY - MAX GPU UTILIZATION\n", + "# Pre-allocates GPU memory, uses pinned memory, CUDA streams\n", + "# ============================================================================\n", + "\n", + "import numpy as np\n", + "import torch\n", + "import threading\n", + "import queue\n", + "\n", + "print(\"=\"*70)\n", + "print(\" PRIORITIZED REPLAY BUFFER - MAX GPU UTILIZATION\")\n", + "print(\"=\"*70)\n", + "\n", + "class SumTree:\n", + " \"\"\"Binary tree for efficient priority sampling\"\"\"\n", + " def __init__(self, capacity):\n", + " self.capacity = capacity\n", + " self.tree = np.zeros(2 * capacity - 1)\n", + " self.data = np.zeros(capacity, dtype=object)\n", + " self.write = 0\n", + " self.n_entries = 0\n", + " self._lock = threading.Lock()\n", + " \n", + " def _propagate(self, idx, change):\n", + " parent = (idx - 1) // 2\n", + " self.tree[parent] += change\n", + " if parent != 0:\n", + " self._propagate(parent, change)\n", + " \n", + " def _retrieve(self, idx, s):\n", + " left = 2 * idx + 1\n", + " right = left + 1\n", + " if left >= len(self.tree):\n", + " return idx\n", + " if s <= self.tree[left]:\n", + " return self._retrieve(left, s)\n", + " else:\n", + " return self._retrieve(right, s - self.tree[left])\n", + " \n", + " def total(self):\n", + " return self.tree[0]\n", + " \n", + " def add(self, priority, data):\n", + " with self._lock:\n", + " idx = self.write + self.capacity - 1\n", + " self.data[self.write] = data\n", + " self.update(idx, priority)\n", + " self.write += 1\n", + " if self.write >= self.capacity:\n", + " self.write = 0\n", + " if self.n_entries < self.capacity:\n", + " self.n_entries += 1\n", + " \n", + " def update(self, idx, priority):\n", + " change = priority - self.tree[idx]\n", + " self.tree[idx] = priority\n", + " self._propagate(idx, change)\n", + " \n", + " def get(self, s):\n", + " idx = self._retrieve(0, s)\n", + " data_idx = idx - self.capacity + 1\n", + " return (idx, self.tree[idx], self.data[data_idx])\n", + "\n", + "\n", + "class PrioritizedReplayBuffer:\n", + " \"\"\"\n", + " Prioritized Experience Replay - MAX GPU UTILIZATION\n", + " \n", + " Key optimizations:\n", + " 1. Pre-allocated pinned memory buffers (no allocation during sampling)\n", + " 2. Batch numpy operations (vectorized)\n", + " 3. Direct GPU tensor creation\n", + " 4. 
CUDA streams for async transfer\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def __init__(self, capacity, alpha=0.6, beta_start=0.4, beta_frames=750000, \n",
+    "                 state_dim=None, batch_size=256):\n",
+    "        self.tree = SumTree(capacity)\n",
+    "        self.capacity = capacity\n",
+    "        self.alpha = alpha\n",
+    "        self.beta_start = beta_start\n",
+    "        self.beta_frames = beta_frames\n",
+    "        self.frame = 1\n",
+    "        self.epsilon = 1e-6\n",
+    "        self._lock = threading.Lock()\n",
+    "        \n",
+    "        # Pre-allocate numpy arrays for batch sampling (avoid allocation overhead)\n",
+    "        self.batch_size = batch_size\n",
+    "        self._batch_states = None\n",
+    "        self._batch_actions = None\n",
+    "        self._batch_rewards = None\n",
+    "        self._batch_next_states = None\n",
+    "        self._batch_dones = None\n",
+    "        self._batch_weights = None\n",
+    "        self._initialized = False\n",
+    "    \n",
+    "    def _init_batch_buffers(self, state_dim):\n",
+    "        \"\"\"Lazy initialization of batch buffers once we know state_dim\"\"\"\n",
+    "        if not self._initialized:\n",
+    "            # Plain float32 scratch arrays; pinning happens in sample() via .pin_memory()\n",
+    "            self._batch_states = np.zeros((self.batch_size, state_dim), dtype=np.float32)\n",
+    "            self._batch_actions = np.zeros((self.batch_size, 1), dtype=np.float32)\n",
+    "            self._batch_rewards = np.zeros((self.batch_size, 1), dtype=np.float32)\n",
+    "            self._batch_next_states = np.zeros((self.batch_size, state_dim), dtype=np.float32)\n",
+    "            self._batch_dones = np.zeros((self.batch_size, 1), dtype=np.float32)\n",
+    "            self._batch_weights = np.zeros((self.batch_size, 1), dtype=np.float32)\n",
+    "            self._initialized = True\n",
+    "    \n",
+    "    def _get_beta(self):\n",
+    "        return min(1.0, self.beta_start + self.frame * (1.0 - self.beta_start) / self.beta_frames)\n",
+    "    \n",
+    "    def add(self, state, action, reward, next_state, done):\n",
+    "        with self._lock:\n",
+    "            max_priority = np.max(self.tree.tree[-self.tree.capacity:])\n",
+    "            if max_priority == 0:\n",
+    "                max_priority = 1.0\n",
+    "            experience = (state, action, reward, next_state, done)\n",
+    "            self.tree.add(max_priority, experience)\n",
+    "    \n",
+    "    def add_batch(self, states, actions, rewards, next_states, dones):\n",
+    "        \"\"\"Batch add for vectorized environments\"\"\"\n",
+    "        for i in range(len(states)):\n",
+    "            self.add(states[i], actions[i], rewards[i], next_states[i], dones[i])\n",
+    "    \n",
+    "    def sample(self, batch_size):\n",
+    "        \"\"\"\n",
+    "        Sample a batch, stratified across the priority tree\n",
+    "        Returns pinned-memory tensors ready for async GPU transfer\n",
+    "        \"\"\"\n",
+    "        with self._lock:\n",
+    "            batch = []\n",
+    "            indices = []\n",
+    "            priorities = []\n",
+    "            \n",
+    "            segment = self.tree.total() / batch_size\n",
+    "            beta = self._get_beta()\n",
+    "            self.frame += 1\n",
+    "            \n",
+    "            # Vectorized random number generation\n",
+    "            random_vals = np.random.uniform(0, 1, batch_size)\n",
+    "            \n",
+    "            for i in range(batch_size):\n",
+    "                a = segment * i\n",
+    "                b = segment * (i + 1)\n",
+    "                s = a + random_vals[i] * (b - a)\n",
+    "                \n",
+    "                idx, priority, data = self.tree.get(s)\n",
+    "                \n",
+    "                if data is not None:\n",
+    "                    indices.append(idx)\n",
+    "                    priorities.append(priority)\n",
+    "                    batch.append(data)\n",
+    "            \n",
+    "            # Initialize buffers if needed\n",
+    "            state_dim = len(batch[0][0])\n",
+    "            self._init_batch_buffers(state_dim)\n",
+    "            \n",
+    "            # Vectorized extraction - much faster than a per-field Python loop\n",
+    "            states = np.array([x[0] for x in batch], dtype=np.float32)\n",
+    "            actions = np.array([x[1] for x in batch], 
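dtype=np.float32).reshape(-1, 1)\n",
+    "            # Importance-sampling correction (computed below): with sampling\n",
+    "            # probability P(i) = p_i / sum_j p_j, each transition gets weight\n",
+    "            #     w_i = (N * P(i))^(-beta) / max_j w_j\n",
+    "            # so over-sampled high-priority transitions are down-weighted in\n",
+    "            # the loss; beta anneals from beta_start toward 1.0 over beta_frames.\n",
+    "            # the cast above: np.array(..., 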
dtype=np.float32).reshape(-1, 1)\n", + " rewards = np.array([x[2] for x in batch], dtype=np.float32).reshape(-1, 1)\n", + " next_states = np.array([x[3] for x in batch], dtype=np.float32)\n", + " dones = np.array([x[4] for x in batch], dtype=np.float32).reshape(-1, 1)\n", + " \n", + " # Vectorized importance sampling weights\n", + " priorities = np.array(priorities, dtype=np.float32)\n", + " sampling_probs = priorities / (self.tree.total() + 1e-8)\n", + " is_weights = np.power(self.tree.n_entries * sampling_probs + 1e-8, -beta)\n", + " is_weights /= (is_weights.max() + 1e-8)\n", + " is_weights = is_weights.reshape(-1, 1).astype(np.float32)\n", + " \n", + " # Create pinned memory tensors for fast async GPU transfer\n", + " states_t = torch.from_numpy(states).pin_memory()\n", + " actions_t = torch.from_numpy(actions).pin_memory()\n", + " rewards_t = torch.from_numpy(rewards).pin_memory()\n", + " next_states_t = torch.from_numpy(next_states).pin_memory()\n", + " dones_t = torch.from_numpy(dones).pin_memory()\n", + " weights_t = torch.from_numpy(is_weights).pin_memory()\n", + " \n", + " return states_t, actions_t, rewards_t, next_states_t, dones_t, indices, weights_t\n", + " \n", + " def update_priorities(self, indices, td_errors):\n", + " with self._lock:\n", + " for idx, td_error in zip(indices, td_errors):\n", + " priority = (abs(td_error) + self.epsilon) ** self.alpha\n", + " self.tree.update(idx, priority)\n", + " \n", + " def update_priorities_batch(self, indices, td_errors_tensor):\n", + " \"\"\"Batch update from GPU tensor - single CPU transfer\"\"\"\n", + " td_errors_np = td_errors_tensor.detach().cpu().numpy().flatten()\n", + " priorities = (np.abs(td_errors_np) + self.epsilon) ** self.alpha\n", + " \n", + " with self._lock:\n", + " for idx, priority in zip(indices, priorities):\n", + " self.tree.update(idx, priority)\n", + " \n", + " def __len__(self):\n", + " return self.tree.n_entries\n", + " \n", + " def is_ready(self, batch_size):\n", + " return len(self) >= batch_size\n", + "\n", + "\n", + "# ============================================================================\n", + "# ASYNC BATCH SAMPLER - Pre-transfers batches to GPU in background\n", + "# ============================================================================\n", + "\n", + "class AsyncBatchSampler:\n", + " \"\"\"\n", + " Async Replay Buffer Pre-Sampling - MAX GPU UTILIZATION\n", + " \n", + " Key optimizations:\n", + " 1. Background thread pre-samples batches\n", + " 2. Pre-transfers to GPU using CUDA streams\n", + " 3. Queue-based double buffering\n", + " 4. 
Zero GPU idle time during sampling\n", + " \"\"\"\n", + " \n", + " def __init__(self, replay_buffer, batch_size, device, queue_size=4):\n", + " self.buffer = replay_buffer\n", + " self.batch_size = batch_size\n", + " self.device = device\n", + " self.queue_size = queue_size\n", + " \n", + " # Pre-sampled batch queue\n", + " self.batch_queue = queue.Queue(maxsize=queue_size)\n", + " \n", + " # CUDA stream for async transfer\n", + " if torch.cuda.is_available() and 'cuda' in str(device):\n", + " self.transfer_stream = torch.cuda.Stream(device=device)\n", + " else:\n", + " self.transfer_stream = None\n", + " \n", + " # Control flags\n", + " self.running = False\n", + " self.producer_thread = None\n", + " \n", + " # Statistics\n", + " self.batches_produced = 0\n", + " self.batches_consumed = 0\n", + " \n", + " def _producer_loop(self):\n", + " \"\"\"Producer: continuously samples and pre-transfers to GPU\"\"\"\n", + " while self.running:\n", + " try:\n", + " if self.batch_queue.full():\n", + " import time\n", + " time.sleep(0.0005) # Reduced sleep time\n", + " continue\n", + " \n", + " if len(self.buffer) >= self.batch_size:\n", + " # Sample batch (returns pinned memory tensors)\n", + " batch = self.buffer.sample(self.batch_size)\n", + " states, actions, rewards, next_states, dones, indices, weights = batch\n", + " \n", + " # Pre-transfer to GPU using CUDA stream (non-blocking)\n", + " if self.transfer_stream is not None:\n", + " with torch.cuda.stream(self.transfer_stream):\n", + " gpu_batch = (\n", + " states.to(self.device, non_blocking=True),\n", + " actions.to(self.device, non_blocking=True),\n", + " rewards.to(self.device, non_blocking=True),\n", + " next_states.to(self.device, non_blocking=True),\n", + " dones.to(self.device, non_blocking=True),\n", + " indices,\n", + " weights.to(self.device, non_blocking=True)\n", + " )\n", + " # Synchronize stream to ensure transfer is complete\n", + " self.transfer_stream.synchronize()\n", + " else:\n", + " gpu_batch = (\n", + " states.to(self.device),\n", + " actions.to(self.device),\n", + " rewards.to(self.device),\n", + " next_states.to(self.device),\n", + " dones.to(self.device),\n", + " indices,\n", + " weights.to(self.device)\n", + " )\n", + " \n", + " self.batch_queue.put(gpu_batch, timeout=1.0)\n", + " self.batches_produced += 1\n", + " else:\n", + " import time\n", + " time.sleep(0.005)\n", + " \n", + " except queue.Full:\n", + " continue\n", + " except Exception as e:\n", + " print(f\"AsyncBatchSampler error: {e}\")\n", + " continue\n", + " \n", + " def start(self):\n", + " if self.running:\n", + " return\n", + " self.running = True\n", + " self.producer_thread = threading.Thread(\n", + " target=self._producer_loop,\n", + " daemon=True,\n", + " name=\"AsyncBatchSampler\"\n", + " )\n", + " self.producer_thread.start()\n", + " print(f\" ✓ AsyncBatchSampler started (queue_size={self.queue_size}, CUDA stream enabled)\")\n", + " \n", + " def get_batch(self, timeout=5.0):\n", + " \"\"\"Get pre-sampled GPU-ready batch instantly\"\"\"\n", + " try:\n", + " batch = self.batch_queue.get(timeout=timeout)\n", + " self.batches_consumed += 1\n", + " return batch\n", + " except queue.Empty:\n", + " # Fallback: sample directly\n", + " print(\"⚠️ Queue empty, sampling directly\")\n", + " batch = self.buffer.sample(self.batch_size)\n", + " states, actions, rewards, next_states, dones, indices, weights = batch\n", + " return (\n", + " states.to(self.device),\n", + " actions.to(self.device),\n", + " rewards.to(self.device),\n", + " 
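# NOTE (added): synchronous fallback - this path only runs when\n",
+    "                # the producer thread hasn't filled the queue yet; it pays one\n",
+    "                # blocking sample + transfer, which is fine occasionally but\n",
+    "                # suggests a larger queue_size if it happens often.\n",
+    "                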
next_states.to(self.device),\n", + " dones.to(self.device),\n", + " indices,\n", + " weights.to(self.device)\n", + " )\n", + " \n", + " def stop(self):\n", + " self.running = False\n", + " if self.producer_thread is not None:\n", + " self.producer_thread.join(timeout=2.0)\n", + " print(f\" ✓ AsyncBatchSampler stopped (produced={self.batches_produced}, consumed={self.batches_consumed})\")\n", + " \n", + " def update_priorities(self, indices, td_errors_tensor):\n", + " self.buffer.update_priorities_batch(indices, td_errors_tensor)\n", + " \n", + " def __len__(self):\n", + " return len(self.buffer)\n", + "\n", + "\n", + "print(\"\\n✅ PrioritizedReplayBuffer ready! (MAX GPU UTILIZATION)\")\n", + "print(\"\\n🚀 GPU Optimizations:\")\n", + "print(\" ✓ Pre-allocated batch buffers\")\n", + "print(\" ✓ Pinned memory for fast GPU transfer\")\n", + "print(\" ✓ CUDA streams for async transfer\")\n", + "print(\" ✓ Vectorized numpy operations\")\n", + "print(\" ✓ Queue-based double buffering (size=4)\")\n", + "print(\" ✓ Background pre-sampling thread\")\n", + "print(\" ✓ Zero GPU idle time\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 4.1: MULTI-AGENT TRAINING FUNCTION - MAXIMUM GPU UTILIZATION\n", + "# Thread-safe version with aggressive GPU-native operations\n", + "# ============================================================================\n", + "\n", + "import time\n", + "import os\n", + "import threading\n", + "import queue\n", + "from collections import deque\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "print(\"=\"*70)\n", + "print(\" MULTI-AGENT TRAINING - MAXIMUM GPU UTILIZATION\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# TRAINING FUNCTION FOR SINGLE AGENT - GPU NATIVE\n", + "# Maximum GPU usage with minimal CPU roundtrips\n", + "# ============================================================================\n", + "\n", + "def train_single_agent(agent_id, config, train_data, valid_data, classes_dict, result_queue=None):\n", + " \"\"\"\n", + " Train a single agent with MAXIMUM GPU UTILIZATION\n", + " \n", + " GPU Optimizations Applied:\n", + " 1. TF32 matmul enabled (2-3x speedup on Ampere+)\n", + " 2. GPU-cached environment data (5-10x env speedup)\n", + " 3. Async replay buffer pre-sampling (10-20% speedup)\n", + " 4. GPU-native action selection (no CPU roundtrip)\n", + " 5. Pinned memory for any necessary CPU<->GPU transfers\n", + " 6. CUDA streams for async operations\n", + " 7. 
Batched replay buffer additions\n", + " \n", + " Args:\n", + " agent_id: Unique identifier for this agent\n", + " config: Agent configuration dictionary\n", + " train_data: Training data\n", + " valid_data: Validation data\n", + " classes_dict: Dictionary containing class references\n", + " result_queue: Queue to put results (optional)\n", + " \"\"\"\n", + " import numpy as np\n", + " import torch\n", + " import torch.nn.functional as F\n", + " import time\n", + " import os\n", + " \n", + " # Get classes from dictionary\n", + " SimpleReturnEnv_cls = classes_dict['SimpleReturnEnv']\n", + " LogReturnEnv_cls = classes_dict['LogReturnEnv']\n", + " V9StyleEnv_cls = classes_dict['V9StyleEnv'] # NEW: Version 9 style reward\n", + " MultiAgentSAC_cls = classes_dict['MultiAgentSAC']\n", + " PrioritizedReplayBuffer_cls = classes_dict['PrioritizedReplayBuffer']\n", + " AsyncBatchSampler_cls = classes_dict['AsyncBatchSampler']\n", + " VectorizedEnvWrapper_cls = classes_dict.get('VectorizedEnvWrapper', None)\n", + " \n", + " agent_name = config['name']\n", + " gpu_id = config['gpu_id']\n", + " reward_type = config['reward_type']\n", + " env_params = config['env_params']\n", + " agent_params = config['agent_params']\n", + " train_params = config['training_params']\n", + " \n", + " # Vectorization settings\n", + " use_vectorized = config.get('use_vectorized', False)\n", + " num_envs = config.get('num_envs', 4)\n", + " \n", + " try:\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\" 🚀 STARTING {agent_name} on GPU {gpu_id} (MAX GPU MODE)\")\n", + " print(f\" Reward Type: {reward_type}\")\n", + " print(f\" Vectorized: {use_vectorized} ({num_envs} envs)\" if use_vectorized else \" Vectorized: False\")\n", + " print(f\"{'='*60}\")\n", + " \n", + " # Set GPU for this thread\n", + " if torch.cuda.is_available():\n", + " torch.cuda.set_device(gpu_id)\n", + " device_str = f\"cuda:{gpu_id}\"\n", + " device = torch.device(device_str)\n", + " \n", + " # Create CUDA stream for this agent (async operations)\n", + " compute_stream = torch.cuda.Stream(device=device)\n", + " transfer_stream = torch.cuda.Stream(device=device)\n", + " else:\n", + " device_str = \"cpu\"\n", + " device = torch.device(\"cpu\")\n", + " compute_stream = None\n", + " transfer_stream = None\n", + " \n", + " # Set random seeds (unique per agent)\n", + " agent_num = int(agent_id.split('_')[-1]) if isinstance(agent_id, str) else agent_id\n", + " seed = train_params['seed'] + agent_num\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " if torch.cuda.is_available():\n", + " torch.cuda.manual_seed(seed)\n", + " \n", + " # Create environment based on reward type\n", + " if reward_type == 'v9_style':\n", + " env_class = V9StyleEnv_cls\n", + " elif reward_type == 'simple_return':\n", + " env_class = SimpleReturnEnv_cls\n", + " else:\n", + " env_class = LogReturnEnv_cls\n", + " \n", + " # Create environments (GPU-cached data)\n", + " if use_vectorized and VectorizedEnvWrapper_cls is not None:\n", + " train_env = VectorizedEnvWrapper_cls(\n", + " env_class, train_data, \n", + " num_envs=num_envs, \n", + " device=device_str,\n", + " **env_params\n", + " )\n", + " # V9StyleEnv uses churning_penalty, others use inactivity_penalty\n", + " if reward_type == 'v9_style':\n", + " valid_env = env_class(valid_data, device=device_str,\n", + " initial_balance=env_params['initial_balance'],\n", + " base_episode_length=env_params['base_episode_length'],\n", + " base_transaction_fee=env_params['base_transaction_fee'],\n", + " 
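# NOTE (added): V9StyleEnv takes churning_penalty while the\n",
+    "                                   # other reward variants take inactivity_penalty\n",
+    "                                   # (see the branches below):\n",
+    "                                   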
churning_penalty=env_params.get('churning_penalty', 0.0001),\n", + " domain_randomization=False)\n", + " else:\n", + " valid_env = env_class(valid_data, device=device_str,\n", + " initial_balance=env_params['initial_balance'],\n", + " base_episode_length=env_params['base_episode_length'],\n", + " base_transaction_fee=env_params['base_transaction_fee'],\n", + " inactivity_penalty=env_params.get('inactivity_penalty', 0.0005),\n", + " domain_randomization=False)\n", + " print(f\"[{agent_name}] ✓ Vectorized environment ({num_envs} parallel envs)\")\n", + " else:\n", + " if reward_type == 'v9_style':\n", + " train_env = V9StyleEnv_cls(train_data, device=device_str, **env_params)\n", + " valid_env = V9StyleEnv_cls(valid_data, device=device_str,\n", + " initial_balance=env_params['initial_balance'],\n", + " base_episode_length=env_params['base_episode_length'],\n", + " base_transaction_fee=env_params['base_transaction_fee'],\n", + " churning_penalty=env_params.get('churning_penalty', 0.0001),\n", + " domain_randomization=False)\n", + " elif reward_type == 'simple_return':\n", + " train_env = SimpleReturnEnv_cls(train_data, device=device_str, **env_params)\n", + " valid_env = SimpleReturnEnv_cls(valid_data, device=device_str,\n", + " initial_balance=env_params['initial_balance'],\n", + " base_episode_length=env_params['base_episode_length'],\n", + " base_transaction_fee=env_params['base_transaction_fee'],\n", + " inactivity_penalty=env_params['inactivity_penalty'],\n", + " domain_randomization=False)\n", + " else:\n", + " train_env = LogReturnEnv_cls(train_data, device=device_str, **env_params)\n", + " valid_env = LogReturnEnv_cls(valid_data, device=device_str,\n", + " initial_balance=env_params['initial_balance'],\n", + " base_episode_length=env_params['base_episode_length'],\n", + " base_transaction_fee=env_params['base_transaction_fee'],\n", + " slippage=env_params.get('slippage', 0.0005),\n", + " inactivity_penalty=env_params['inactivity_penalty'],\n", + " domain_randomization=False)\n", + " \n", + " state_dim = train_env.observation_space.shape[0]\n", + " \n", + " # Create agent (eager mode for thread-safety)\n", + " agent = MultiAgentSAC_cls(\n", + " agent_name=agent_name,\n", + " state_dim=state_dim,\n", + " action_dim=1,\n", + " gpu_id=gpu_id,\n", + " use_compile=False, # DISABLED for thread-safety\n", + " **agent_params\n", + " )\n", + " \n", + " device = agent.device\n", + " \n", + " # Create replay buffer\n", + " replay_buffer = PrioritizedReplayBuffer_cls(\n", + " capacity=3000000,\n", + " alpha=0.6,\n", + " beta_start=0.4,\n", + " beta_frames=750000\n", + " )\n", + " \n", + " # Create async batch sampler\n", + " async_sampler = AsyncBatchSampler_cls(\n", + " replay_buffer, \n", + " batch_size=agent.batch_size,\n", + " device=device,\n", + " queue_size=3\n", + " )\n", + " \n", + " # Create checkpoint directory\n", + " save_dir = f\"multi_agent_checkpoints/{agent_name}\"\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " \n", + " # Metrics\n", + " episode_returns = []\n", + " eval_returns = []\n", + " best_eval_return = -np.inf\n", + " \n", + " # Pre-allocate GPU tensors for action conversion (avoid repeated allocation)\n", + " action_buffer_gpu = torch.zeros(1, 1, device=device)\n", + " \n", + " start_time = time.time()\n", + " \n", + " # ================================================================\n", + " # WARMUP PHASE\n", + " # ================================================================\n", + " print(f\"[{agent_name}] 🔥 Warmup: {train_params['warmup_steps']} 
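random-action steps\")\n",
+    "        \n",
+    "        # NOTE (added): warmup fills the PER buffer with uniform-random actions\n",
+    "        # before any gradient step, so the first sampled batches are not\n",
+    "        # dominated by a handful of max-priority initial transitions.\n",
+    "        print(f\"[{agent_name}]    collecting {train_params['warmup_steps']} 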
steps...\")\n", + " \n", + " if use_vectorized:\n", + " states = train_env.reset()\n", + " steps_done = 0\n", + " while steps_done < train_params['warmup_steps']:\n", + " actions = np.random.uniform(-1, 1, size=(num_envs, 1))\n", + " next_states, rewards, dones, _ = train_env.step(actions)\n", + " replay_buffer.add_batch(states, actions, rewards, next_states, dones.astype(float))\n", + " states = next_states\n", + " steps_done += num_envs\n", + " else:\n", + " state = train_env.reset()\n", + " for step in range(train_params['warmup_steps']):\n", + " action = np.random.uniform(-1, 1, size=(1,))\n", + " next_state, reward, done, _ = train_env.step(action)\n", + " replay_buffer.add(state, action, reward, next_state, float(done))\n", + " state = train_env.reset() if done else next_state\n", + " \n", + " print(f\"[{agent_name}] ✅ Buffer: {len(replay_buffer):,} transitions\")\n", + " \n", + " # Start async batch sampler\n", + " async_sampler.start()\n", + " \n", + " # ================================================================\n", + " # GPU MEMORY INFO\n", + " # ================================================================\n", + " if torch.cuda.is_available():\n", + " allocated = torch.cuda.memory_allocated(gpu_id) / 1e9\n", + " reserved = torch.cuda.memory_reserved(gpu_id) / 1e9\n", + " print(f\"[{agent_name}] 📊 GPU Memory: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved\")\n", + " \n", + " # ================================================================\n", + " # TRAINING LOOP - MAXIMUM GPU UTILIZATION\n", + " # ================================================================\n", + " for episode in range(1, train_params['num_episodes'] + 1):\n", + " if use_vectorized:\n", + " # Vectorized episode (already GPU-optimized)\n", + " states = train_env.reset()\n", + " episode_return = 0\n", + " steps_in_episode = 0\n", + " max_steps = train_env.envs[0].episode_length if hasattr(train_env.envs[0], 'episode_length') else 500\n", + " \n", + " while steps_in_episode < max_steps:\n", + " actions = agent.get_action_batch(states, deterministic=False)\n", + " next_states, rewards, dones, infos = train_env.step(actions)\n", + " replay_buffer.add_batch(states, actions, rewards, next_states, dones.astype(float))\n", + " \n", + " # Training update\n", + " if len(replay_buffer) >= agent.batch_size:\n", + " for _ in range(agent.gradient_steps):\n", + " batch = async_sampler.get_batch()\n", + " states_t, actions_t, rewards_t, next_states_t, dones_t, indices, weights_t = batch\n", + " \n", + " td_errors, c1_loss, c2_loss, a_loss, alpha_loss = agent.train_step(\n", + " states_t, actions_t, rewards_t, next_states_t, dones_t, weights_t\n", + " )\n", + " \n", + " async_sampler.update_priorities(indices, td_errors)\n", + " \n", + " episode_return += rewards.sum()\n", + " states = next_states\n", + " steps_in_episode += num_envs\n", + " \n", + " episode_returns.append(episode_return / num_envs)\n", + " else:\n", + " # Single environment episode - GPU NATIVE\n", + " state = train_env.reset()\n", + " episode_return = 0\n", + " done = False\n", + " \n", + " # Convert initial state to GPU tensor\n", + " if not isinstance(state, torch.Tensor):\n", + " state_gpu = torch.FloatTensor(state).to(device, non_blocking=True)\n", + " else:\n", + " state_gpu = state.to(device, non_blocking=True) if state.device != device else state\n", + " \n", + " while not done:\n", + " # GPU-NATIVE ACTION SELECTION (stays on GPU)\n", + " with torch.no_grad():\n", + " if hasattr(agent, 'get_action_gpu'):\n", + " action_gpu = 
agent.get_action_gpu(state_gpu, deterministic=False)\n",
+    "                        else:\n",
+    "                            # Fallback: actor.sample() returns a 3-tuple\n",
+    "                            # (action, log_prob, ...); keep the action on GPU\n",
+    "                            state_t = state_gpu.unsqueeze(0) if state_gpu.dim() == 1 else state_gpu\n",
+    "                            action_gpu, _, _ = agent.actor.sample(state_t)\n",
+    "                            action_gpu = action_gpu.squeeze(0)\n",
+    "                \n",
+    "                # Environment step (the env API expects numpy; keep the conversion minimal)\n",
+    "                action_np = action_gpu.cpu().numpy().flatten()\n",
+    "                next_state, reward, done, info = train_env.step(action_np)\n",
+    "                \n",
+    "                # Convert next_state to GPU immediately with non_blocking\n",
+    "                if not isinstance(next_state, torch.Tensor):\n",
+    "                    next_state_gpu = torch.FloatTensor(next_state).to(device, non_blocking=True)\n",
+    "                else:\n",
+    "                    next_state_gpu = next_state.to(device, non_blocking=True) if next_state.device != device else next_state\n",
+    "                \n",
+    "                # Add to replay buffer (uses CPU copies for numpy compatibility)\n",
+    "                if isinstance(state, torch.Tensor):\n",
+    "                    state_np = state.cpu().numpy() if state.device.type == 'cuda' else state.numpy()\n",
+    "                else:\n",
+    "                    state_np = state\n",
+    "                if isinstance(next_state, torch.Tensor):\n",
+    "                    next_state_np = next_state.cpu().numpy() if next_state.device.type == 'cuda' else next_state.numpy()\n",
+    "                else:\n",
+    "                    next_state_np = next_state\n",
+    "                \n",
+    "                replay_buffer.add(state_np, action_np, reward, next_state_np, float(done))\n",
+    "                \n",
+    "                # Training update - ALL ON GPU\n",
+    "                if len(replay_buffer) >= agent.batch_size:\n",
+    "                    for _ in range(agent.gradient_steps):\n",
+    "                        batch = async_sampler.get_batch()\n",
+    "                        states_t, actions_t, rewards_t, next_states_t, dones_t, indices, weights_t = batch\n",
+    "                        \n",
+    "                        # All training computation on GPU\n",
+    "                        td_errors, c1_loss, c2_loss, a_loss, alpha_loss = agent.train_step(\n",
+    "                            states_t, actions_t, rewards_t, next_states_t, dones_t, weights_t\n",
+    "                        )\n",
+    "                        \n",
+    "                        async_sampler.update_priorities(indices, td_errors)\n",
+    "                \n",
+    "                episode_return += reward\n",
+    "                # Update state tensors for next iteration\n",
+    "                state = next_state\n",
+    "                state_gpu = next_state_gpu\n",
+    "            \n",
+    "                episode_returns.append(episode_return)\n",
+    "        \n",
+    "            # Evaluation\n",
+    "            if episode % train_params['eval_frequency'] == 0:\n",
+    "                eval_episode_returns = []\n",
+    "                \n",
+    "                agent.actor.eval()\n",
+    "                for _ in range(train_params['eval_episodes']):\n",
+    "                    eval_state = valid_env.reset()\n",
+    "                    eval_return = 0\n",
+    "                    eval_done = False\n",
+    "                    eval_trades = 0  # Track trades during eval\n",
+    "                    \n",
+    "                    # GPU-native evaluation\n",
+    "                    if not isinstance(eval_state, torch.Tensor):\n",
+    "                        eval_state_gpu = torch.FloatTensor(eval_state).to(device)\n",
+    "                    else:\n",
+    "                        eval_state_gpu = eval_state.to(device)\n",
+    "                    \n",
+    "                    while not eval_done:\n",
+    "                        with torch.no_grad():\n",
+    "                            if hasattr(agent, 'get_action_gpu'):\n",
+    "                                eval_action_gpu = agent.get_action_gpu(eval_state_gpu, deterministic=True)\n",
+    "                            else:\n",
+    "                                eval_state_t = eval_state_gpu.unsqueeze(0) if eval_state_gpu.dim() == 1 else eval_state_gpu\n",
+    "                                mean, _ = agent.actor(eval_state_t)\n",
+    "                                eval_action_gpu = torch.tanh(mean).squeeze(0)\n",
+    "                        \n",
+    "                        eval_action = eval_action_gpu.cpu().numpy().flatten()\n",
+    "                        eval_state, eval_reward, eval_done, info = valid_env.step(eval_action)\n",
+    "                        eval_return += eval_reward\n",
+    "                        \n",
+    "                        # Get trade count from env if available\n",
+    "                        if hasattr(valid_env, 'trade_count'):\n",
+    "                            eval_trades = valid_env.trade_count\n",
+    "                        elif 'trade_count' in info:\n",
+    "                            eval_trades = 
info['trade_count']\n", + " \n", + " if not isinstance(eval_state, torch.Tensor):\n", + " eval_state_gpu = torch.FloatTensor(eval_state).to(device)\n", + " else:\n", + " eval_state_gpu = eval_state.to(device)\n", + " \n", + " eval_episode_returns.append((eval_return, eval_trades))\n", + " agent.actor.train()\n", + " \n", + " mean_eval = np.mean([r[0] for r in eval_episode_returns])\n", + " mean_trades = np.mean([r[1] for r in eval_episode_returns])\n", + " eval_returns.append(mean_eval)\n", + " \n", + " if mean_eval > best_eval_return:\n", + " best_eval_return = mean_eval\n", + " agent.save_weights(f\"{save_dir}/best_ep{episode}\")\n", + " print(f\"[{agent_name}] 🏆 NEW BEST! Ep {episode} | Eval: {mean_eval:.4f} | Trades: {mean_trades:.0f}\")\n", + " \n", + " elapsed = time.time() - start_time\n", + " recent_train = np.mean(episode_returns[-10:])\n", + " \n", + " # Show GPU memory periodically\n", + " if torch.cuda.is_available() and episode % (train_params['eval_frequency'] * 5) == 0:\n", + " gpu_mem = torch.cuda.memory_allocated(gpu_id) / 1e9\n", + " print(f\"[{agent_name}] Ep {episode:4d} | \"\n", + " f\"Train: {recent_train:7.4f} | \"\n", + " f\"Eval: {mean_eval:7.4f} | \"\n", + " f\"Trades: {mean_trades:3.0f} | \"\n", + " f\"Best: {best_eval_return:7.4f} | \"\n", + " f\"α: {agent.alpha:.4f} | \"\n", + " f\"GPU: {gpu_mem:.2f}GB | \"\n", + " f\"Time: {elapsed/60:5.1f}m\")\n", + " else:\n", + " print(f\"[{agent_name}] Ep {episode:4d} | \"\n", + " f\"Train: {recent_train:7.4f} | \"\n", + " f\"Eval: {mean_eval:7.4f} | \"\n", + " f\"Trades: {mean_trades:3.0f} | \"\n", + " f\"Best: {best_eval_return:7.4f} | \"\n", + " f\"α: {agent.alpha:.4f} | \"\n", + " f\"Time: {elapsed/60:5.1f}m\")\n", + " \n", + " # Stop async sampler\n", + " async_sampler.stop()\n", + " \n", + " # Final save\n", + " agent.save_weights(f\"{save_dir}/final_ep{train_params['num_episodes']}\")\n", + " \n", + " total_time = time.time() - start_time\n", + " \n", + " result = {\n", + " 'agent_name': agent_name,\n", + " 'best_eval_return': best_eval_return,\n", + " 'final_train_return': np.mean(episode_returns[-100:]) if len(episode_returns) >= 100 else np.mean(episode_returns),\n", + " 'total_time_minutes': total_time / 60,\n", + " 'episode_returns': episode_returns,\n", + " 'eval_returns': eval_returns\n", + " }\n", + " \n", + " print(f\"\\n[{agent_name}] 🎉 TRAINING COMPLETE!\")\n", + " print(f\" Best eval: {best_eval_return:.4f}\")\n", + " print(f\" Time: {total_time/60:.1f} min\")\n", + " \n", + " # Final GPU memory report\n", + " if torch.cuda.is_available():\n", + " gpu_mem = torch.cuda.memory_allocated(gpu_id) / 1e9\n", + " print(f\" GPU Memory Used: {gpu_mem:.2f}GB\")\n", + " \n", + " if result_queue is not None:\n", + " result_queue.put(result)\n", + " \n", + " return result\n", + " \n", + " except Exception as e:\n", + " print(f\"\\n[{agent_name}] ❌ ERROR: {str(e)}\")\n", + " import traceback\n", + " traceback.print_exc()\n", + " \n", + " error_result = {\n", + " 'agent_name': agent_name,\n", + " 'best_eval_return': -np.inf,\n", + " 'final_train_return': -np.inf,\n", + " 'total_time_minutes': 0,\n", + " 'episode_returns': [],\n", + " 'eval_returns': [],\n", + " 'error': str(e)\n", + " }\n", + " \n", + " if result_queue is not None:\n", + " result_queue.put(error_result)\n", + " \n", + " return error_result\n", + "\n", + "\n", + "print(\"✅ train_single_agent function ready! 
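(example config sketch below)\")\n",
+    "\n",
+    "# --- Hypothetical config sketch (illustrative values, not a tuned setup) ---\n",
+    "# Shows only the keys train_single_agent() actually reads:\n",
+    "_EXAMPLE_AGENT_CONFIG = {\n",
+    "    'name': 'sac_demo',\n",
+    "    'gpu_id': 0,\n",
+    "    'reward_type': 'simple_return',  # or 'log_return' / 'v9_style'\n",
+    "    'env_params': {\n",
+    "        'initial_balance': 10000,\n",
+    "        'base_episode_length': 500,\n",
+    "        'base_transaction_fee': 0.001,\n",
+    "        'inactivity_penalty': 0.0005,  # v9_style reads churning_penalty instead\n",
+    "    },\n",
+    "    'agent_params': {},  # forwarded verbatim to MultiAgentSAC(**agent_params)\n",
+    "    'training_params': {\n",
+    "        'seed': 42, 'warmup_steps': 5000, 'num_episodes': 100,\n",
+    "        'eval_frequency': 10, 'eval_episodes': 3,\n",
+    "    },\n",
+    "    # optional: 'use_vectorized': True, 'num_envs': 4,\n",
+    "}\n",
+    "\n",
+    "print(\"   Each agent trains in 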
(MAXIMUM GPU MODE)\")\n", + "print(\"\\n🚀 GPU Optimizations Active:\")\n", + "print(\" ✓ TF32 matmul enabled (2-3x speedup on Ampere+)\")\n", + "print(\" ✓ GPU-cached environment data\")\n", + "print(\" ✓ Async replay buffer pre-sampling\")\n", + "print(\" ✓ GPU-native action selection (minimal CPU roundtrips)\")\n", + "print(\" ✓ Non-blocking GPU transfers\")\n", + "print(\" ✓ Pre-allocated GPU tensors\")\n", + "print(\" ✓ CUDA streams for async operations\")\n", + "print(\"\\n Expected GPU utilization: 70-90%+\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 4.2: RUN PARALLEL TRAINING - NOTEBOOK COMPATIBLE\n", + "# Uses ThreadPoolExecutor for Jupyter notebook compatibility\n", + "# (Multiprocessing doesn't work in notebooks due to pickling issues)\n", + "# ============================================================================\n", + "\n", + "import threading\n", + "import queue\n", + "import time\n", + "import os\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "\n", + "print(\"=\"*70)\n", + "print(\" 🚀 PARALLEL TRAINING: NOTEBOOK COMPATIBLE (ThreadPool)\")\n", + "print(\"=\"*70)\n", + "\n", + "# ============================================================================\n", + "# CHECK GPU AVAILABILITY\n", + "# ============================================================================\n", + "if torch.cuda.is_available():\n", + " num_gpus = torch.cuda.device_count()\n", + " print(f\"\\n✅ {num_gpus} GPU(s) available:\")\n", + " for i in range(num_gpus):\n", + " print(f\" GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + " mem = torch.cuda.get_device_properties(i).total_memory / 1e9\n", + " print(f\" Memory: {mem:.2f} GB\")\n", + "else:\n", + " print(\"❌ No GPU available! 
Training will use CPU.\")\n",
+    "\n",
+    "# ============================================================================\n",
+    "# PARALLEL TRAINING WITH THREADPOOL (NOTEBOOK COMPATIBLE)\n",
+    "# Uses ThreadPoolExecutor - works in Jupyter notebooks\n",
+    "# GPU operations release GIL, so threads still provide good parallelism\n",
+    "# ============================================================================\n",
+    "\n",
+    "def run_parallel_training(use_vectorized=False, num_envs=4):\n",
+    "    \"\"\"\n",
+    "    Run all configured agents in parallel using ThreadPoolExecutor.\n",
+    "    \n",
+    "    NOTE: We use threading instead of multiprocessing because:\n",
+    "    - Multiprocessing with 'spawn' can't pickle notebook-defined functions\n",
+    "    - GPU operations release the GIL anyway, so threads parallelize well\n",
+    "    - ThreadPoolExecutor is more reliable in Jupyter notebooks\n",
+    "    \n",
+    "    The remaining optimization tiers still provide large speedups:\n",
+    "    - TIER 2: TF32 matmul (2-3x speedup on Ampere+; torch.compile stays\n",
+    "      disabled here for thread-safety)\n",
+    "    - TIER 3: GPU-cached environments (5-10x speedup)\n",
+    "    - TIER 4: Vectorized environments (2-4x speedup)\n",
+    "    - TIER 5: Async batch sampling (10-20% speedup)\n",
+    "    \n",
+    "    Args:\n",
+    "        use_vectorized: Enable TIER 4 vectorized environments\n",
+    "        num_envs: Number of parallel environments per agent (if vectorized)\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    print(\"\\n\" + \"=\"*70)\n",
+    "    print(\" STARTING PARALLEL TRAINING (ThreadPoolExecutor)\")\n",
+    "    print(\"=\"*70)\n",
+    "    \n",
+    "    # Build classes dictionary\n",
+    "    classes_dict = {\n",
+    "        'SimpleReturnEnv': SimpleReturnEnv,\n",
+    "        'LogReturnEnv': LogReturnEnv,\n",
+    "        'V9StyleEnv': V9StyleEnv,  # NEW: Version 9 style reward\n",
+    "        'MultiAgentSAC': MultiAgentSAC,\n",
+    "        'PrioritizedReplayBuffer': PrioritizedReplayBuffer,\n",
+    "        'AsyncBatchSampler': AsyncBatchSampler,\n",
+    "        'VectorizedEnvWrapper': VectorizedEnvWrapper,\n",
+    "    }\n",
+    "    print(\"✅ Class references collected\")\n",
+    "    \n",
+    "    # Update configs with vectorization settings\n",
+    "    for agent_id, config in AGENT_CONFIGS.items():\n",
+    "        config['use_vectorized'] = use_vectorized\n",
+    "        config['num_envs'] = num_envs\n",
+    "    \n",
+    "    # Create checkpoint directory\n",
+    "    os.makedirs(\"multi_agent_checkpoints\", exist_ok=True)\n",
+    "    \n",
+    "    # Prepare agent configs list\n",
+    "    agent_configs = list(AGENT_CONFIGS.items())\n",
+    "    \n",
+    "    print(f\"\\n🚀 Starting {len(agent_configs)} training threads...\")\n",
+    "    for agent_id, config in agent_configs:\n",
+    "        print(f\"   • {config['name']} → GPU {config['gpu_id']}\")\n",
+    "    \n",
+    "    start_time = time.time()\n",
+    "    results = []\n",
+    "    \n",
+    "    # Use ThreadPoolExecutor for parallel training\n",
+    "    # max_workers = one thread per agent\n",
+    "    with ThreadPoolExecutor(max_workers=len(agent_configs)) as executor:\n",
+    "        # Submit all training jobs\n",
+    "        futures = {}\n",
+    "        for agent_id, config in agent_configs:\n",
+    "            future = executor.submit(\n",
+    "                train_single_agent,\n",
+    "                agent_id, config, train_data, valid_data, classes_dict, None\n",
+    "            )\n",
+    "            futures[future] = config['name']\n",
+    "            print(f\"   ✅ {config['name']} submitted\")\n",
+    "            time.sleep(0.5)  # Stagger to avoid CUDA init race\n",
+    "        \n",
+    "        print(\"\\n⏳ Training in progress...\")\n",
+    "        print(\"   (GPU operations release GIL - threads run in parallel)\\n\")\n",
+    "        \n",
+    "        # Collect results as they complete\n",
+    "        for future in as_completed(futures):\n",
+    "            agent_name = futures[future]\n",
+    "            try:\n",
+    "                result = future.result()\n",
+    "                results.append(result)\n",
+    "                if 
'error' not in result:\n", + " print(f\" ✅ {agent_name} completed: Best eval = {result['best_eval_return']:.4f}\")\n", + " else:\n", + " print(f\" ❌ {agent_name} failed: {result['error']}\")\n", + " except Exception as e:\n", + " print(f\" ❌ {agent_name} exception: {e}\")\n", + " results.append({\n", + " 'agent_name': agent_name,\n", + " 'best_eval_return': -float('inf'),\n", + " 'error': str(e)\n", + " })\n", + " \n", + " total_time = time.time() - start_time\n", + " \n", + " # Print summary\n", + " print(\"\\n\" + \"=\"*70)\n", + " print(\" 🎉 ALL AGENTS TRAINING COMPLETE!\")\n", + " print(\"=\"*70)\n", + " print(f\"\\n⏱️ Total training time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)\")\n", + " \n", + " # Check for errors\n", + " successful_results = [r for r in results if 'error' not in r]\n", + " failed_results = [r for r in results if 'error' in r]\n", + " \n", + " if failed_results:\n", + " print(f\"\\n⚠️ {len(failed_results)} agent(s) failed:\")\n", + " for r in failed_results:\n", + " print(f\" ❌ {r['agent_name']}: {r.get('error', 'Unknown error')}\")\n", + " \n", + " if successful_results:\n", + " print(\"\\n📊 RESULTS SUMMARY:\")\n", + " print(\"-\"*70)\n", + " print(f\"{'Agent':<25} {'Best Eval':>12} {'Final Train':>12} {'Time (min)':>10}\")\n", + " print(\"-\"*70)\n", + " \n", + " for r in sorted(successful_results, key=lambda x: x['best_eval_return'], reverse=True):\n", + " print(f\"{r['agent_name']:<25} {r['best_eval_return']:>12.4f} {r['final_train_return']:>12.4f} {r['total_time_minutes']:>10.1f}\")\n", + " \n", + " print(\"-\"*70)\n", + " \n", + " best_agent = max(successful_results, key=lambda x: x['best_eval_return'])\n", + " print(f\"\\n🏆 BEST AGENT: {best_agent['agent_name']}\")\n", + " print(f\" Best Eval Return: {best_agent['best_eval_return']:.4f}\")\n", + " \n", + " print(\"\\n📁 Checkpoints saved to: multi_agent_checkpoints/\")\n", + " print(\"\\n🚀 HYBRID GPU Optimizations used:\")\n", + " print(\" ✓ TF32 matmul (2-3x speedup on Ampere+)\")\n", + " print(\" ✓ TIER 3: GPU-cached environments (5-10x speedup)\")\n", + " if use_vectorized:\n", + " print(f\" ✓ TIER 4: Vectorized envs ({num_envs} per agent)\")\n", + " print(\" ✓ TIER 5: Async batch sampling (10-20% speedup)\")\n", + " print(\" ✓ Eager mode for thread-safety\")\n", + " print(\" ℹ️ Threading used (GPU ops release GIL for parallelism)\")\n", + " print(\"=\"*70)\n", + " \n", + " return results\n", + "\n", + "\n", + "# ============================================================================\n", + "# SEQUENTIAL TRAINING (for debugging or single-agent)\n", + "# ============================================================================\n", + "\n", + "def run_sequential_training(agent_ids=None):\n", + " \"\"\"\n", + " Run agents sequentially (one at a time).\n", + " Useful for debugging or when parallel training has issues.\n", + " \n", + " Args:\n", + " agent_ids: List of agent IDs to train, or None for all agents\n", + " \"\"\"\n", + " \n", + " print(\"\\n\" + \"=\"*70)\n", + " print(\" STARTING SEQUENTIAL TRAINING\")\n", + " print(\"=\"*70)\n", + " \n", + " classes_dict = {\n", + " 'SimpleReturnEnv': SimpleReturnEnv,\n", + " 'LogReturnEnv': LogReturnEnv,\n", + " 'V9StyleEnv': V9StyleEnv, # NEW: Version 9 style reward\n", + " 'MultiAgentSAC': MultiAgentSAC,\n", + " 'PrioritizedReplayBuffer': PrioritizedReplayBuffer,\n", + " 'AsyncBatchSampler': AsyncBatchSampler,\n", + " 'VectorizedEnvWrapper': VectorizedEnvWrapper,\n", + " }\n", + " \n", + " os.makedirs(\"multi_agent_checkpoints\", 
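exist_ok=True)\n",
+    "    \n",
+    "    # NOTE (added): handy for debugging a single reward variant, e.g.\n",
+    "    #     run_sequential_training(['agent_1'])\n",
+    "    # trains one config end-to-end with no thread interleaving.\n",
+    "    # Checkpoints land under multi_agent_checkpoints/ (created above with 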
exist_ok=True)\n", + " \n", + " if agent_ids is None:\n", + " agent_ids = list(AGENT_CONFIGS.keys())\n", + " \n", + " results = []\n", + " start_time = time.time()\n", + " \n", + " for agent_id in agent_ids:\n", + " if agent_id not in AGENT_CONFIGS:\n", + " print(f\"⚠️ Unknown agent: {agent_id}\")\n", + " continue\n", + " \n", + " config = AGENT_CONFIGS[agent_id]\n", + " print(f\"\\n🚀 Training {config['name']}...\")\n", + " \n", + " result = train_single_agent(agent_id, config, train_data, valid_data, classes_dict, None)\n", + " results.append(result)\n", + " \n", + " total_time = time.time() - start_time\n", + " print(f\"\\n⏱️ Total time: {total_time/60:.1f} minutes\")\n", + " \n", + " return results\n", + "\n", + "\n", + "# ============================================================================\n", + "# CONFIGURATION SUMMARY\n", + "# ============================================================================\n", + "\n", + "print(\"\\n⚠️ READY TO START PARALLEL TRAINING!\")\n", + "print(\"\\n📋 Agent Configuration:\")\n", + "for agent_id, config in AGENT_CONFIGS.items():\n", + " print(f\" {config['name']}: GPU {config['gpu_id']}, {config['reward_type']}\")\n", + "\n", + "print(\"\\n💡 To start training:\")\n", + "print(\" • Parallel (recommended): results = run_parallel_training()\")\n", + "print(\" • With vectorization: results = run_parallel_training(use_vectorized=True, num_envs=4)\")\n", + "print(\" • Sequential (debug): results = run_sequential_training()\")\n", + "print(\" • Single agent: results = run_sequential_training(['agent_1'])\")\n", + "print(\"\\n🚀 HYBRID GPU Optimizations (Thread-Safe):\")\n", + "print(\" ✓ TF32 matmul (2-3x speedup on Ampere+)\")\n", + "print(\" ✓ TIER 3: GPU environments (5-10x speedup)\")\n", + "print(\" ✓ TIER 4: Vectorized envs (2-4x speedup) - optional\")\n", + "print(\" ✓ TIER 5: Async sampling (10-20% speedup)\")\n", + "print(\" ✓ Eager mode (thread-safe, no torch.compile)\")\n", + "print(\"\\n ℹ️ Using ThreadPoolExecutor for parallel training\")\n", + "print(\" (GPU operations release GIL for parallelism)\")\n", + "print(\"=\"*70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL: START TRAINING (RUN THIS CELL)\n", + "# ============================================================================\n", + "\n", + "print(\"=\"*70)\n", + "print(\" 🚀 STARTING GPU OPTIMIZED TRAINING\")\n", + "print(\"=\"*70)\n", + "\n", + "# Run parallel training with VECTORIZED environments (8 envs per agent)\n", + "# This massively increases GPU utilization by batching environment steps\n", + "results = run_parallel_training(use_vectorized=True, num_envs=8)\n", + "\n", + "# After training, compare the agents:\n", + "# compare_agents(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ============================================================================\n", + "# CELL 5.1: MULTI-AGENT COMPARISON & VISUALIZATION\n", + "# ============================================================================\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "\n", + "print(\"=\"*70)\n", + "print(\" 📊 MULTI-AGENT COMPARISON & VISUALIZATION\")\n", + "print(\"=\"*70)\n", + "\n", + "def compare_agents(results):\n", + " \"\"\"\n", + " Visualize and compare all trained agents\n", + " \"\"\"\n", + " \n", + 
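"    # NOTE (added): each entry of results is the dict returned by\n",
+    "    # train_single_agent(): agent_name, best_eval_return, final_train_return,\n",
+    "    # total_time_minutes, episode_returns, eval_returns ('error' on failure).\n",
+    "    # Plot 1 below spaces eval points 10 episodes apart - keep that in sync\n",
+    "    # with training_params['eval_frequency'] if you change it.\n",
+    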
" if results is None or len(results) == 0:\n", + " print(\"❌ No results to compare! Run training first.\")\n", + " return\n", + " \n", + " fig = plt.figure(figsize=(16, 12))\n", + " \n", + " # ================================================================\n", + " # PLOT 1: Evaluation Returns Over Training\n", + " # ================================================================\n", + " ax1 = plt.subplot(2, 2, 1)\n", + " \n", + " colors = ['blue', 'green', 'red', 'orange']\n", + " for i, r in enumerate(results):\n", + " eval_returns = r.get('eval_returns', [])\n", + " if len(eval_returns) > 0:\n", + " episodes = np.arange(10, 10 * len(eval_returns) + 1, 10)\n", + " plt.plot(episodes, eval_returns, color=colors[i % len(colors)], \n", + " label=r['agent_name'], linewidth=2, alpha=0.8)\n", + " \n", + " plt.title('Evaluation Returns During Training', fontsize=12, weight='bold')\n", + " plt.xlabel('Episode')\n", + " plt.ylabel('Eval Return')\n", + " plt.legend()\n", + " plt.grid(alpha=0.3)\n", + " \n", + " # ================================================================\n", + " # PLOT 2: Best Eval Return Comparison (Bar Chart)\n", + " # ================================================================\n", + " ax2 = plt.subplot(2, 2, 2)\n", + " \n", + " agent_names = [r['agent_name'] for r in results]\n", + " best_returns = [r['best_eval_return'] for r in results]\n", + " \n", + " bars = plt.bar(agent_names, best_returns, color=colors[:len(results)], alpha=0.8)\n", + " \n", + " # Add value labels on bars\n", + " for bar, val in zip(bars, best_returns):\n", + " plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,\n", + " f'{val:.4f}', ha='center', va='bottom', fontsize=10)\n", + " \n", + " plt.title('Best Evaluation Return by Agent', fontsize=12, weight='bold')\n", + " plt.ylabel('Best Eval Return')\n", + " plt.xticks(rotation=15)\n", + " plt.grid(alpha=0.3, axis='y')\n", + " \n", + " # ================================================================\n", + " # PLOT 3: Training Time Comparison\n", + " # ================================================================\n", + " ax3 = plt.subplot(2, 2, 3)\n", + " \n", + " times = [r['total_time_minutes'] for r in results]\n", + " \n", + " bars = plt.bar(agent_names, times, color=colors[:len(results)], alpha=0.8)\n", + " \n", + " for bar, val in zip(bars, times):\n", + " plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,\n", + " f'{val:.1f}m', ha='center', va='bottom', fontsize=10)\n", + " \n", + " plt.title('Training Time by Agent', fontsize=12, weight='bold')\n", + " plt.ylabel('Time (minutes)')\n", + " plt.xticks(rotation=15)\n", + " plt.grid(alpha=0.3, axis='y')\n", + " \n", + " # ================================================================\n", + " # PLOT 4: Episode Returns Distribution\n", + " # ================================================================\n", + " ax4 = plt.subplot(2, 2, 4)\n", + " \n", + " for i, r in enumerate(results):\n", + " episode_returns = r.get('episode_returns', [])\n", + " if len(episode_returns) > 0:\n", + " # Moving average\n", + " window = 50\n", + " if len(episode_returns) > window:\n", + " ma = np.convolve(episode_returns, np.ones(window)/window, mode='valid')\n", + " plt.plot(ma, color=colors[i % len(colors)], \n", + " label=r['agent_name'], linewidth=1.5, alpha=0.8)\n", + " \n", + " plt.title('Training Returns (50-episode moving average)', fontsize=12, weight='bold')\n", + " plt.xlabel('Episode')\n", + " plt.ylabel('Episode Return')\n", + " plt.legend()\n", + " 
plt.grid(alpha=0.3)\n", + " \n", + " plt.tight_layout()\n", + " plt.savefig('multi_agent_comparison.png', dpi=150, bbox_inches='tight')\n", + " plt.show()\n", + " \n", + " # ================================================================\n", + " # PRINT DETAILED COMPARISON\n", + " # ================================================================\n", + " print(\"\\n\" + \"=\"*70)\n", + " print(\" 📊 DETAILED COMPARISON\")\n", + " print(\"=\"*70)\n", + " \n", + " print(f\"\\n{'Agent':<25} {'Reward Type':<15} {'GPU':>5} {'Best Eval':>12} {'Final Train':>12}\")\n", + " print(\"-\"*70)\n", + " \n", + " for r in sorted(results, key=lambda x: x['best_eval_return'], reverse=True):\n", + " # Get config info\n", + " agent_key = [k for k, v in AGENT_CONFIGS.items() if v['name'] == r['agent_name']][0]\n", + " config = AGENT_CONFIGS[agent_key]\n", + " \n", + " print(f\"{r['agent_name']:<25} {config['reward_type']:<15} {config['gpu_id']:>5} \"\n", + " f\"{r['best_eval_return']:>12.4f} {r['final_train_return']:>12.4f}\")\n", + " \n", + " print(\"-\"*70)\n", + " \n", + " # Winner analysis\n", + " best = max(results, key=lambda x: x['best_eval_return'])\n", + " worst = min(results, key=lambda x: x['best_eval_return'])\n", + " \n", + " print(f\"\\n🏆 WINNER: {best['agent_name']} (Eval: {best['best_eval_return']:.4f})\")\n", + " print(f\" Improvement over worst: {((best['best_eval_return'] - worst['best_eval_return']) / abs(worst['best_eval_return']) * 100):.1f}%\")\n", + " \n", + " print(\"\\n📁 Comparison plot saved to: multi_agent_comparison.png\")\n", + "\n", + "\n", + "# Run comparison if results exist\n", + "print(\"\\n💡 To compare agents, run: compare_agents(results)\")\n", + "print(\" (After training is complete)\")\n", + "print(\"=\"*70)" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "nvidiaTeslaT4", + "dataSources": [ + { + "databundleVersionId": 11857421, + "datasetId": 7097204, + "sourceId": 11420269, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14713599, + "datasetId": 7608804, + "sourceId": 13942443, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14683978, + "datasetId": 8867139, + "sourceId": 13916129, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14709093, + "datasetId": 5656419, + "sourceId": 13938349, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14661984, + "datasetId": 8853352, + "sourceId": 13896214, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14217399, + "datasetId": 8569093, + "sourceId": 13496378, + "sourceType": "datasetVersion" + }, + { + "databundleVersionId": 14226416, + "datasetId": 8574199, + "sourceId": 13504593, + "sourceType": "datasetVersion" + } + ], + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}