# %%
# ============================================================================
# CELL 1: PYTORCH GPU SETUP (KAGGLE 30GB GPU)
# ============================================================================
# Install the `ta` package used by the indicator code in cell 2. Going through
# the running interpreter keeps this file valid Python (no `!` shell escape)
# and installs into the same environment the kernel uses. `check=False`
# mirrors the shell escape: a failed install does not abort the cell.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "-q", "ta"], check=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

print("="*70)
print(" PYTORCH GPU SETUP (30GB GPU)")
print("="*70)

# ============================================================================
# GPU CONFIGURATION FOR MAXIMUM PERFORMANCE
# ============================================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    # Get GPU info
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ GPU: {gpu_name}")
    print(f"✅ GPU Memory: {gpu_mem:.1f} GB")

    # Enable TF32 for faster matmul (Ampere GPUs: A100, RTX 30xx, 40xx)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("✅ TF32: Enabled (2-3x speedup on Ampere)")

    # Enable cuDNN autotuner
    torch.backends.cudnn.benchmark = True
    print("✅ cuDNN benchmark: Enabled")

    # Set default tensor type to CUDA
    # NOTE(review): this changes where every later factory call allocates;
    # the agent code below also passes `device` explicitly, so it does not
    # appear to depend on this global -- confirm before removing.
    torch.set_default_device('cuda')
    print("✅ Default device: CUDA")
else:
    print("⚠️ No GPU detected, using CPU")

print(f"\n✅ PyTorch: {torch.__version__}")
print(f"✅ Device: {device}")
print("="*70)

# %%
# ============================================================================
# CELL 2: LOAD DATA + FEATURES + TRAIN/VALID/TEST SPLIT
# ============================================================================
import numpy as np
import pandas as pd
import gym
from gym import spaces
from sklearn.preprocessing import StandardScaler
from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator, WilliamsRIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator, ADXIndicator, CCIIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import OnBalanceVolumeIndicator
import os

print("="*70)
print(" LOADING DATA + FEATURES")
print("="*70)

# ============================================================================
# 1. LOAD BITCOIN DATA
# ============================================================================
data_path = '/kaggle/input/bitcoin-historical-datasets-2018-2024/'
btc_data = pd.read_csv(os.path.join(data_path, 'btc_15m_data_2018_to_2025.csv'))

column_mapping = {'Open time': 'timestamp', 'Open': 'open', 'High': 'high',
                  'Low': 'low', 'Close': 'close', 'Volume': 'volume'}
btc_data = btc_data.rename(columns=column_mapping)
btc_data['timestamp'] = pd.to_datetime(btc_data['timestamp'])
btc_data.set_index('timestamp', inplace=True)

# Keep only OHLCV and coerce every column to numeric in one pass; values that
# cannot be parsed become NaN and are dropped below.
btc_data = btc_data[['open', 'high', 'low', 'close', 'volume']].apply(
    pd.to_numeric, errors='coerce'
)

btc_data = btc_data[btc_data.index >= '2021-01-01']
# Sort first so that `keep='first'` on duplicated timestamps is time-ordered.
btc_data = btc_data.sort_index()
btc_data = btc_data[~btc_data.index.duplicated(keep='first')]
# A zero in any OHLCV field is treated as missing and the candle is dropped.
btc_data = btc_data.replace(0, np.nan).dropna()

print(f"✅ BTC Data: {len(btc_data):,} candles")

# ============================================================================
# 2.
# LOAD FEAR & GREED INDEX
# ============================================================================
# Best-effort load: the first CSV in `fgi_path` that exposes a column whose
# name contains "fgi", "fear" or "greed" is used. If nothing usable is found,
# a constant neutral value of 50 is substituted so later cells still run.
fgi_loaded = False
fgi_path = '/kaggle/input/btc-usdt-4h-ohlc-fgi-daily-2020/'

try:
    for filename in os.listdir(fgi_path):
        if not filename.endswith('.csv'):
            continue

        candidate = pd.read_csv(os.path.join(fgi_path, filename))

        # Find timestamp column (fall back to the first column)
        time_col = [c for c in candidate.columns
                    if 'time' in c.lower() or 'date' in c.lower()]
        if time_col:
            candidate['timestamp'] = pd.to_datetime(candidate[time_col[0]])
        else:
            candidate['timestamp'] = pd.to_datetime(candidate.iloc[:, 0])
        candidate.set_index('timestamp', inplace=True)

        # Find FGI column
        fgi_col = [c for c in candidate.columns
                   if 'fgi' in c.lower() or 'fear' in c.lower() or 'greed' in c.lower()]
        if fgi_col:
            fgi_data = candidate[[fgi_col[0]]].rename(columns={fgi_col[0]: 'fgi'})
            fgi_loaded = True
            print(f"✅ Fear & Greed loaded: {len(fgi_data):,} values")
            break
except Exception as exc:
    # The dataset is optional, so a failure must not abort the notebook --
    # but report why, instead of silently swallowing the error.
    fgi_loaded = False
    print(f"⚠️ Fear & Greed not loaded: {exc}")

if not fgi_loaded:
    fgi_data = pd.DataFrame(index=btc_data.index)
    fgi_data['fgi'] = 50
    print("⚠️ Using neutral FGI values")

# Duplicate FGI timestamps would duplicate candle rows in the left join below.
fgi_data = fgi_data[~fgi_data.index.duplicated(keep='last')].sort_index()

# Merge FGI: exact-timestamp left join, then forward-fill between readings.
# The back-fill and constant 50 only cover candles before the first reading.
btc_data = btc_data.join(fgi_data, how='left')
btc_data['fgi'] = btc_data['fgi'].ffill().bfill().fillna(50)

# ============================================================================
# 3.
# TECHNICAL INDICATORS
# ============================================================================
# Builds the feature frame `btc_features` from `btc_data`. Every column other
# than open/high/low/close/volume is later treated as a model feature.
print("🔧 Calculating indicators...")
data = btc_data.copy()

# Momentum (oscillators rescaled by 100; unbounded ones squashed with tanh)
data['rsi_14'] = RSIIndicator(close=data['close'], window=14).rsi() / 100
data['rsi_7'] = RSIIndicator(close=data['close'], window=7).rsi() / 100

stoch = StochasticOscillator(high=data['high'], low=data['low'], close=data['close'], window=14)
data['stoch_k'] = stoch.stoch() / 100
data['stoch_d'] = stoch.stoch_signal() / 100

roc = ROCIndicator(close=data['close'], window=12)
data['roc_12'] = np.tanh(roc.roc() / 100)

williams = WilliamsRIndicator(high=data['high'], low=data['low'], close=data['close'], lbp=14)
data['williams_r'] = (williams.williams_r() + 100) / 100

macd = MACD(close=data['close'])
data['macd'] = np.tanh(macd.macd() / data['close'] * 100)
data['macd_signal'] = np.tanh(macd.macd_signal() / data['close'] * 100)
data['macd_diff'] = np.tanh(macd.macd_diff() / data['close'] * 100)

# Trend
data['sma_20'] = SMAIndicator(close=data['close'], window=20).sma_indicator()
data['sma_50'] = SMAIndicator(close=data['close'], window=50).sma_indicator()
data['ema_12'] = EMAIndicator(close=data['close'], window=12).ema_indicator()
data['ema_26'] = EMAIndicator(close=data['close'], window=26).ema_indicator()
data['price_vs_sma20'] = (data['close'] - data['sma_20']) / data['sma_20']
data['price_vs_sma50'] = (data['close'] - data['sma_50']) / data['sma_50']

adx = ADXIndicator(high=data['high'], low=data['low'], close=data['close'], window=14)
data['adx'] = adx.adx() / 100
data['adx_pos'] = adx.adx_pos() / 100
data['adx_neg'] = adx.adx_neg() / 100

cci = CCIIndicator(high=data['high'], low=data['low'], close=data['close'], window=20)
data['cci'] = np.tanh(cci.cci() / 100)

# Volatility
bb = BollingerBands(close=data['close'], window=20, window_dev=2)
bb_upper = bb.bollinger_hband()
bb_lower = bb.bollinger_lband()
bb_range = bb_upper - bb_lower
data['bb_width'] = bb_range / bb.bollinger_mavg()
# The band width is zero whenever the last 20 closes are identical; guard the
# division with the same epsilon the other ratio features in this cell use.
data['bb_position'] = (data['close'] - bb_lower) / (bb_range + 1e-8)

atr = AverageTrueRange(high=data['high'], low=data['low'], close=data['close'], window=14)
data['atr_percent'] = atr.average_true_range() / data['close']

# Volume
data['volume_ma_20'] = data['volume'].rolling(20).mean()
data['volume_ratio'] = data['volume'] / (data['volume_ma_20'] + 1e-8)

obv = OnBalanceVolumeIndicator(close=data['close'], volume=data['volume'])
obv_series = obv.on_balance_volume()
data['obv_slope'] = obv_series.diff(5) / (obv_series.shift(5).abs() + 1e-8)

# Price action
data['returns_1'] = data['close'].pct_change()
data['returns_5'] = data['close'].pct_change(5)
data['returns_20'] = data['close'].pct_change(20)
data['volatility_20'] = data['returns_1'].rolling(20).std()
data['body_size'] = (data['close'] - data['open']).abs() / (data['open'] + 1e-8)
data['high_20'] = data['high'].rolling(20).max()
data['low_20'] = data['low'].rolling(20).min()
data['price_position'] = (data['close'] - data['low_20']) / (data['high_20'] - data['low_20'] + 1e-8)

# Fear & Greed
# NOTE(review): `diff()` and `rolling(7)` operate on candles, not days. If the
# FGI series is daily and forward-filled onto intraday candles, `fgi_change`
# is non-zero only where the value steps and `fgi_ma7` spans 7 candles --
# confirm whether a 7-day window was intended.
data['fgi_normalized'] = (data['fgi'] - 50) / 50
data['fgi_change'] = data['fgi'].diff() / 50
data['fgi_ma7'] = data['fgi'].rolling(7).mean()
data['fgi_vs_ma'] = (data['fgi'] - data['fgi_ma7']) / 50

# Time
data['hour'] = data.index.hour / 24
data['day_of_week'] = data.index.dayofweek / 7
data['us_session'] = ((data.index.hour >= 14) & (data.index.hour < 21)).astype(float)

# `dropna()` does not remove +/-inf, so convert any infinities produced by the
# ratio features to NaN first; otherwise they would reach the scaler later.
btc_features = data.replace([np.inf, -np.inf], np.nan).dropna()

feature_cols = [col for col in btc_features.columns
                if col not in ['open', 'high', 'low', 'close', 'volume']]
print(f"✅ Features: {len(feature_cols)}")

# ============================================================================
# 4.
# TRAIN / VALID / TEST SPLIT (70/15/15)
# ============================================================================
# Chronological split (no shuffling): earliest 70% train, next 15% validation,
# remainder test.
train_size = int(len(btc_features) * 0.70)
valid_size = int(len(btc_features) * 0.15)

train_data = btc_features.iloc[:train_size].copy()
valid_data = btc_features.iloc[train_size:train_size+valid_size].copy()
test_data = btc_features.iloc[train_size+valid_size:].copy()

print(f"\n📊 Train: {len(train_data):,} | Valid: {len(valid_data):,} | Test: {len(test_data):,}")


# ============================================================================
# 5. TRADING ENVIRONMENT (WITH ANTI-SHORT BIAS)
# ============================================================================
class BitcoinTradingEnv(gym.Env):
    """Single-asset trading environment with a continuous position in [-1, 1].

    The action is a target position (negative = short, positive = long). A
    position is only changed when the target differs from the current one by
    more than 0.1, and targets with magnitude <= 0.1 mean "flat". PnL is
    computed on a notional of ``position * initial_balance``.

    Uses the legacy gym API: ``reset()`` returns the observation only and
    ``step()`` returns ``(obs, reward, done, info)``.

    Args:
        df: Frame with a ``close`` column, ``returns_1`` and ``rsi_14``, plus
            feature columns (everything except open/high/low/close/volume).
        initial_balance: Starting cash and per-unit position notional.
        episode_length: Maximum number of steps per episode.
        transaction_fee: Fraction deducted when a position is closed.
        long_bonus: Reward added on every step spent long.
        short_penalty_threshold: Share of active steps spent short above
            which the end-of-episode penalty applies.
        short_penalty: Maximum end-of-episode penalty (reached at 100% short).

    Raises:
        ValueError: If ``df`` is too short to hold one full episode starting
            at row 100 (raised from ``reset()``, which ``__init__`` calls).
    """

    def __init__(self, df, initial_balance=10000, episode_length=500,
                 transaction_fee=0.0, long_bonus=0.0001,
                 short_penalty_threshold=0.8, short_penalty=0.05):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.episode_length = episode_length
        self.transaction_fee = transaction_fee

        # Anti-short bias parameters
        self.long_bonus = long_bonus                            # Small bonus for being long
        self.short_penalty_threshold = short_penalty_threshold  # If >80% short, penalize
        self.short_penalty = short_penalty                      # Penalty amount at episode end

        self.feature_cols = [col for col in df.columns
                             if col not in ['open', 'high', 'low', 'close', 'volume']]

        # Cache the columns read on every step as NumPy arrays. A pandas
        # `.loc` lookup per step is orders of magnitude slower than array
        # indexing and this environment is stepped hundreds of thousands of
        # times. float64 is kept so observations match the frame's values.
        self._features = self.df[self.feature_cols].to_numpy(dtype=np.float64)
        self._close = self.df['close'].to_numpy(dtype=np.float64)
        self._returns_1 = self.df['returns_1'].to_numpy(dtype=np.float64)
        self._rsi_14 = self.df['rsi_14'].to_numpy(dtype=np.float64)

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(
            low=-10, high=10,
            shape=(len(self.feature_cols) + 5,),
            dtype=np.float32
        )
        self.reset()

    def reset(self):
        """Start a new episode at a random row and return the first observation."""
        max_start = len(self.df) - self.episode_length - 1
        if max_start < 100:
            # Starting at row 100, the episode would index past the end of
            # the data; fail here with a clear message instead of later with
            # an opaque lookup error.
            raise ValueError(
                f"DataFrame has {len(self.df)} rows but at least "
                f"{self.episode_length + 101} are required for "
                f"episode_length={self.episode_length}"
            )
        self.start_idx = np.random.randint(100, max(101, max_start))
        self.current_step = 0
        self.balance = self.initial_balance
        self.position = 0.0
        self.entry_price = 0.0
        self.total_value = self.initial_balance
        self.prev_total_value = self.initial_balance
        self.max_value = self.initial_balance

        # Track position history for bias detection
        self.long_steps = 0
        self.short_steps = 0
        self.neutral_steps = 0
        return self._get_obs()

    def _get_obs(self):
        """Return features plus 5 portfolio values, clipped to [-10, 10]."""
        idx = self.start_idx + self.current_step

        total_return = (self.total_value / self.initial_balance) - 1
        drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0

        portfolio_info = np.array([
            self.position,
            total_return,
            drawdown,
            self._returns_1[idx],
            self._rsi_14[idx]
        ], dtype=np.float32)

        obs = np.concatenate([self._features[idx], portfolio_info])
        return np.clip(obs, -10, 10).astype(np.float32)

    def step(self, action):
        """Apply a target position and advance one candle.

        Returns:
            ``(obs, reward, done, info)``. ``done`` is set when the episode
            length is reached or equity falls to 50% of the initial balance.
        """
        idx = self.start_idx + self.current_step
        current_price = self._close[idx]
        target_position = np.clip(action[0], -1.0, 1.0)
        self.prev_total_value = self.total_value

        # Rebalance only on a meaningful change: close whatever is open, then
        # re-open at the new size unless the target is effectively flat.
        if abs(target_position - self.position) > 0.1:
            if self.position != 0:
                self._close_position(current_price)
            if abs(target_position) > 0.1:
                self._open_position(target_position, current_price)

        self._update_total_value(current_price)
        self.max_value = max(self.max_value, self.total_value)

        # Track position type
        if self.position > 0.1:
            self.long_steps += 1
        elif self.position < -0.1:
            self.short_steps += 1
        else:
            self.neutral_steps += 1

        self.current_step += 1
        done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)

        # ============ REWARD SHAPING ============
        # Base reward: portfolio value change
        reward = (self.total_value - self.prev_total_value) / self.initial_balance

        # Small bonus for being LONG (encourages buying)
        if self.position > 0.1:
            reward += self.long_bonus

        # End-of-episode penalty for excessive shorting
        if done:
            total_active_steps = self.long_steps + self.short_steps
            if total_active_steps > 0:
                short_ratio = self.short_steps / total_active_steps
                if short_ratio > self.short_penalty_threshold:
                    # Scales linearly from 0 at the threshold to
                    # `short_penalty` at 100% short.
                    reward -= self.short_penalty * (short_ratio - self.short_penalty_threshold) / (1 - self.short_penalty_threshold)

        obs = self._get_obs()
        info = {
            'total_value': self.total_value,
            'position': self.position,
            'long_steps': self.long_steps,
            'short_steps': self.short_steps,
            'neutral_steps': self.neutral_steps
        }
        return obs, reward, done, info

    def _update_total_value(self, current_price):
        """Mark the open position to market: balance plus unrealised PnL."""
        if self.position != 0:
            if self.position > 0:
                pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)
            else:
                pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)
            self.total_value = self.balance + pnl
        else:
            self.total_value = self.balance

    def _open_position(self, size, price):
        """Record a new position of ``size`` entered at ``price``."""
        self.position = size
        self.entry_price = price

    def _close_position(self, price):
        """Realise PnL of the open position into the balance and go flat."""
        if self.position > 0:
            pnl = self.position * self.initial_balance * (price / self.entry_price - 1)
        else:
            pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)
        # NOTE(review): the fee is charged on the realised PnL magnitude, not
        # on the traded notional, and nothing is charged on entry -- confirm
        # this is the intended cost model before using a non-zero fee.
        pnl -= abs(pnl) * self.transaction_fee
        self.balance += pnl
        self.position = 0.0


print("✅ Environment class ready (with anti-short bias)")
print("="*70)

# %%
# ============================================================================
# CELL 3: LOAD SENTIMENT DATA
# ============================================================================
print("="*70)
print(" LOADING SENTIMENT DATA")
print("="*70)

sentiment_file = '/kaggle/input/bitcoin-news-with-sentimen/bitcoin_news_3hour_intervals_with_sentiment.csv'

try:
    sentiment_raw = pd.read_csv(sentiment_file)

    def parse_time_range(time_str):
        """Turn '<date> <start>-<end>' into '<date> <start>:00'.

        Values without a space-separated second part are returned unchanged.
        """
        parts = str(time_str).split(' ')
        if len(parts) >= 2:
            date = parts[0]
            time_range = parts[1]
            start_time = time_range.split('-')[0]
            return f"{date} {start_time}:00"
        return time_str

    sentiment_raw['timestamp'] = sentiment_raw['time_interval'].apply(parse_time_range)
    sentiment_raw['timestamp'] = pd.to_datetime(sentiment_raw['timestamp'])
    sentiment_raw = sentiment_raw.set_index('timestamp').sort_index()

    sentiment_clean = pd.DataFrame(index=sentiment_raw.index)
    sentiment_clean['prob_bullish'] = pd.to_numeric(sentiment_raw['prob_bullish'], errors='coerce')
    sentiment_clean['prob_bearish'] = pd.to_numeric(sentiment_raw['prob_bearish'], errors='coerce')
    sentiment_clean['prob_neutral'] = pd.to_numeric(sentiment_raw['prob_neutral'], errors='coerce')
    sentiment_clean['confidence'] = pd.to_numeric(sentiment_raw['sentiment_confidence'], errors='coerce')
    sentiment_clean = sentiment_clean.dropna()

    # Merge with data: exact-timestamp join, then forward-fill so each candle
    # carries the most recent sentiment reading. Back-fill / constants only
    # cover candles that precede the first reading inside a split.
    for df in [train_data, valid_data, test_data]:
        df_temp = df.join(sentiment_clean, how='left')
        for col in ['prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence']:
            default = 0.33 if col != 'confidence' else 0.5
            df[col] = df_temp[col].ffill().bfill().fillna(default)

        df['sentiment_net'] = df['prob_bullish'] - df['prob_bearish']
        df['sentiment_strength'] = (df['prob_bullish'] - df['prob_bearish']).abs()
        df['sentiment_weighted'] = df['sentiment_net'] * df['confidence']

    print(f"✅ Sentiment loaded: {len(sentiment_clean):,} records")
    print(f"✅ Features added: 7 sentiment features")

except Exception as e:
    # Sentiment is optional: fall back to neutral derived features.
    print(f"⚠️ Sentiment not loaded: {e}")
    for df in [train_data, valid_data, test_data]:
        df['sentiment_net'] = 0
        df['sentiment_strength'] = 0
        df['sentiment_weighted'] = 0

print("="*70)

# %%
# ============================================================================
# CELL 4: NORMALIZE + CREATE ENVIRONMENTS
# ============================================================================
from sklearn.preprocessing import StandardScaler

print("="*70)
print(" NORMALIZING DATA + CREATING ENVIRONMENTS")
print("="*70)

# Get feature columns (all except OHLCV)
feature_cols = [col for col in train_data.columns
                if col not in ['open', 'high', 'low', 'close', 'volume']]
print(f"📊 Total features: {len(feature_cols)}")

# Fit scaler on TRAIN ONLY so validation/test statistics never leak in
scaler = StandardScaler()
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
valid_data[feature_cols] = scaler.transform(valid_data[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

# Clip extreme values
for df in [train_data, valid_data, test_data]:
    df[feature_cols] = df[feature_cols].clip(-5, 5)

print("✅ Normalization complete (fitted on train only)")

# Create environments
train_env = BitcoinTradingEnv(train_data, episode_length=500)
valid_env = BitcoinTradingEnv(valid_data, episode_length=500)
test_env = BitcoinTradingEnv(test_data, episode_length=500)

state_dim = train_env.observation_space.shape[0]
action_dim = 1

print(f"\n✅ Environments created:")
print(f" State dim: {state_dim}")
print(f" Action dim: {action_dim}")
print(f" Train episodes: ~{len(train_data)//500}")
print("="*70)

# %%
# ============================================================================
# CELL 5: PYTORCH SAC AGENT (GPU OPTIMIZED)
# ============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

print("="*70)
print(" PYTORCH SAC AGENT")
print("="*70)


# ============================================================================
# ACTOR NETWORK
# ============================================================================
class Actor(nn.Module):
    """Gaussian policy: three ReLU hidden layers with mean and log-std heads."""

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.mean = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Linear(hidden_dim, action_dim)

        # Clamp range for the predicted log standard deviation.
        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2

    def forward(self, state):
        """Return ``(mean, log_std)`` of the pre-tanh Gaussian."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        mean = self.mean(x)
        log_std = self.log_std(x)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        return mean, log_std

    def sample(self, state):
        """Sample a tanh-squashed action.

        Returns:
            ``(action, log_prob, mean)`` where ``log_prob`` is summed over the
            action dimension (kept as a trailing dim of size 1) and ``mean``
            is the pre-tanh mean.
        """
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # Reparameterization trick
        action = torch.tanh(x_t)

        # Log prob with tanh correction (change of variables); the 1e-6 keeps
        # the log finite when |action| approaches 1.
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + 1e-6)
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        return action, log_prob, mean


# ============================================================================
# CRITIC NETWORK
# ============================================================================
class Critic(nn.Module):
    """Twin Q-networks (Q1, Q2) over concatenated ``(state, action)`` input."""

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Q1
        self.fc1_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc1_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc1_3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc1_out = nn.Linear(hidden_dim, 1)
        # Q2
        self.fc2_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_out = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        """Return ``(q1, q2)`` for the given state-action batch."""
        x = torch.cat([state, action], dim=-1)

        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        q1 = self.fc1_out(q1)

        q2 = F.relu(self.fc2_1(x))
        q2 = F.relu(self.fc2_2(q2))
        q2 = F.relu(self.fc2_3(q2))
        q2 = self.fc2_out(q2)
        return q1, q2

    def q1(self, state, action):
        """Return only the first Q-head's estimate."""
        x = torch.cat([state, action], dim=-1)
        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        return self.fc1_out(q1)


# ============================================================================
# SAC AGENT
# ============================================================================
class SACAgent:
    """Soft Actor-Critic with twin critics and automatic entropy tuning.

    Args:
        state_dim: Observation size.
        action_dim: Action size; the target entropy is ``-action_dim``.
        device: Torch device for all networks and batches.
        actor_lr, critic_lr, alpha_lr: Adam learning rates.
        gamma: Discount factor.
        tau: Polyak coefficient for the target critic.
        initial_alpha: Starting entropy temperature.
    """

    def __init__(self, state_dim, action_dim, device,
                 actor_lr=3e-4, critic_lr=3e-4, alpha_lr=3e-4,
                 gamma=0.99, tau=0.005, initial_alpha=0.2):
        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.action_dim = action_dim

        # Networks
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Entropy (auto-tuning alpha). `np.log` yields a float64 scalar, so
        # pin the dtype to float32 to match the networks.
        self.target_entropy = -action_dim
        self.log_alpha = torch.tensor(float(np.log(initial_alpha)), dtype=torch.float32,
                                      requires_grad=True, device=device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

    @property
    def alpha(self):
        """Current entropy temperature, ``exp(log_alpha)``."""
        return self.log_alpha.exp()

    def select_action(self, state, deterministic=False):
        """Return an action for one observation as a NumPy array.

        With ``deterministic=True`` the tanh of the policy mean is returned;
        otherwise an action is sampled from the policy.
        """
        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            if deterministic:
                mean, _ = self.actor(state)
                action = torch.tanh(mean)
            else:
                action, _, _ = self.actor.sample(state)
        return action.cpu().numpy()[0]

    def update(self, batch):
        """Run one SAC update (critic, actor, alpha, target) on ``batch``.

        Args:
            batch: ``(states, actions, rewards, next_states, dones)`` arrays.

        Returns:
            Dict with ``critic_loss``, ``actor_loss``, ``alpha``, ``q_value``.
        """
        states, actions, rewards, next_states, dones = (
            torch.as_tensor(x, dtype=torch.float32, device=self.device) for x in batch
        )

        # ============ Update Critic ============
        with torch.no_grad():
            next_actions, next_log_probs, _ = self.actor.sample(next_states)
            q1_target, q2_target = self.critic_target(next_states, next_actions)
            q_target = torch.min(q1_target, q2_target)
            target_q = rewards + (1 - dones) * self.gamma * (q_target - self.alpha * next_log_probs)

        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic_optimizer.step()

        # ============ Update Actor ============
        new_actions, log_probs, _ = self.actor.sample(states)
        q1_new, q2_new = self.critic(states, new_actions)
        q_new = torch.min(q1_new, q2_new)
        actor_loss = (self.alpha.detach() * log_probs - q_new).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
        self.actor_optimizer.step()

        # ============ Update Alpha ============
        alpha_loss = -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # ============ Update Target ============
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return {
            'critic_loss': critic_loss.item(),
            'actor_loss': actor_loss.item(),
            'alpha': self.alpha.item(),
            'q_value': q1.mean().item()
        }

    def save(self, path):
        """Write network weights and ``log_alpha`` to ``path``."""
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'log_alpha': self.log_alpha.detach().clone(),
        }, path)

    def load(self, path):
        """Restore network weights and ``log_alpha`` from ``path``."""
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        # Copy in place: rebinding `self.log_alpha` would leave
        # `alpha_optimizer` updating the old tensor, so alpha would stop
        # being tuned after a load.
        with torch.no_grad():
            self.log_alpha.copy_(checkpoint['log_alpha'])


print("✅ SACAgent class defined (PyTorch)")
print("="*70)

# %%
# ============================================================================
# CELL 6: REPLAY BUFFER (GPU-FRIENDLY)
# ============================================================================
print("="*70)
print(" REPLAY BUFFER")
print("="*70)


class ReplayBuffer:
    """Fixed-size ring buffer of transitions stored in float32 NumPy arrays."""

    def __init__(self, state_dim, action_dim, max_size=1_000_000):
        self.max_size = max_size
        self.ptr = 0    # next write position
        self.size = 0   # number of valid transitions stored

        self.states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.actions = np.zeros((max_size, action_dim), dtype=np.float32)
        self.rewards = np.zeros((max_size, 1), dtype=np.float32)
        self.next_states = np.zeros((max_size, state_dim), dtype=np.float32)
        self.dones = np.zeros((max_size, 1), dtype=np.float32)

        mem_gb = (self.states.nbytes + self.actions.nbytes + self.rewards.nbytes +
                  self.next_states.nbytes + self.dones.nbytes) / 1e9
        print(f"📦 Buffer capacity: {max_size:,} | Memory: {mem_gb:.2f} GB")

    def add(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest once full."""
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.rewards[self.ptr] = reward
        self.next_states[self.ptr] = next_state
        self.dones[self.ptr] = done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement."""
        idx = np.random.randint(0, self.size, size=batch_size)
        return (
            self.states[idx],
            self.actions[idx],
            self.rewards[idx],
            self.next_states[idx],
            self.dones[idx]
        )


print("✅ ReplayBuffer defined")
print("="*70)

# %%
# ============================================================================
# CELL 7: CREATE AGENT + BUFFER
# ============================================================================
print("="*70)
print(" CREATING AGENT + BUFFER")
print("="*70)

# Create SAC agent
agent = SACAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    device=device,
    actor_lr=3e-4,
    critic_lr=3e-4,
    alpha_lr=3e-4,
    gamma=0.99,
    tau=0.005,
    initial_alpha=0.2
)

# Create replay buffer
buffer = ReplayBuffer(
    state_dim=state_dim,
    action_dim=action_dim,
    max_size=1_000_000
)

# Count parameters
actor_params = sum(p.numel() for p in agent.actor.parameters())
critic_params = sum(p.numel() for p in agent.critic.parameters())
total_params = actor_params + critic_params

print(f"\n✅ Agent created on {device}")
print(f" Actor params: {actor_params:,}")
print(f" Critic params: {critic_params:,}")
print(f" Total params: {total_params:,}")
print("="*70)

# %%
# ============================================================================
# CELL 8: TRAINING FUNCTION (GPU OPTIMIZED)
# ============================================================================
from tqdm.notebook import tqdm
import time

print("="*70)
print(" TRAINING FUNCTION")
print("="*70)


def train_sac(agent, env, valid_env, buffer,
              total_timesteps=700_000,
              warmup_steps=10_000,
              batch_size=1024,
              update_freq=1,
              save_path="sac_v9"):
    """Train ``agent`` on ``env``, evaluating on ``valid_env`` after every episode.

    Actions are sampled uniformly from the action space for the first
    ``warmup_steps`` steps; afterwards the agent acts stochastically and is
    updated every ``update_freq`` steps. Checkpoints are written to
    ``{save_path}_best_train.pt``, ``{save_path}_best_eval.pt`` and
    ``{save_path}_final.pt``.

    Returns:
        ``(episode_rewards, eval_rewards)``: per-episode training rewards and
        the matching validation rewards.
    """
    print(f"\n🚀 Training Configuration:")
    print(f" Total steps: {total_timesteps:,}")
    print(f" Warmup: {warmup_steps:,}")
    print(f" Batch size: {batch_size}")
    print(f" Device: {agent.device}")

    # Read reporting constants from the environment instead of hard-coding
    # them, so PnL% and the shorting warning stay correct for other settings.
    initial_balance = getattr(env, 'initial_balance', 10000)
    short_warn_pct = getattr(env, 'short_penalty_threshold', 0.8) * 100

    # Stats tracking
    episode_rewards = []
    episode_lengths = []
    eval_rewards = []
    best_reward = -np.inf
    best_eval = -np.inf

    # Training stats
    critic_losses = []
    q_values = []

    state = env.reset()
    episode_reward = 0
    episode_length = 0
    episode_count = 0
    start_time = time.time()

    pbar = tqdm(range(total_timesteps), desc="Training")
    for step in pbar:
        # Select action
        if step < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state, deterministic=False)

        # Step environment
        next_state, reward, done, info = env.step(action)

        # Store transition
        # NOTE(review): `done` is also set when the episode hits its length
        # limit, so time-limit truncations are stored as terminal states --
        # confirm this is intended for the bootstrap target.
        buffer.add(state, action, reward, next_state, float(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        # Update agent
        if step >= warmup_steps and step % update_freq == 0:
            stats = agent.update(buffer.sample(batch_size))
            critic_losses.append(stats['critic_loss'])
            q_values.append(stats['q_value'])

        if not done:
            continue

        # ---------------------------- Episode end ----------------------------
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)
        episode_count += 1

        # Calculate episode stats
        final_value = info.get('total_value', initial_balance)
        pnl_pct = (final_value / initial_balance - 1) * 100

        # Get position distribution
        long_steps = info.get('long_steps', 0)
        short_steps = info.get('short_steps', 0)
        neutral_steps = info.get('neutral_steps', 0)
        total_active = long_steps + short_steps
        long_pct = (long_steps / total_active * 100) if total_active > 0 else 0
        short_pct = (short_steps / total_active * 100) if total_active > 0 else 0

        # Update progress bar with detailed info
        avg_reward = np.mean(episode_rewards[-10:]) if len(episode_rewards) >= 10 else episode_reward
        avg_q = np.mean(q_values[-100:]) if q_values else 0
        avg_critic = np.mean(critic_losses[-100:]) if critic_losses else 0

        pbar.set_postfix({
            'ep': episode_count,
            'R': f'{episode_reward:.4f}',
            'avg10': f'{avg_reward:.4f}',
            'PnL%': f'{pnl_pct:+.2f}',
            'L/S': f'{long_pct:.0f}/{short_pct:.0f}',
            'α': f'{agent.alpha.item():.3f}',
        })

        # ============ EVAL EVERY EPISODE ============
        eval_reward, eval_pnl, eval_long_pct = evaluate_agent(agent, valid_env, n_episodes=1)
        eval_rewards.append(eval_reward)

        # Print detailed episode summary
        elapsed = time.time() - start_time
        steps_per_sec = (step + 1) / elapsed

        print(f"\n{'='*60}")
        print(f"📊 Episode {episode_count} Complete | Step {step+1:,}/{total_timesteps:,}")
        print(f"{'='*60}")
        print(f" 🎮 TRAIN:")
        print(f" Reward: {episode_reward:.4f} | PnL: {pnl_pct:+.2f}%")
        print(f" Length: {episode_length} steps")
        print(f" Avg (last 10): {avg_reward:.4f}")
        print(f" 📊 POSITION BALANCE:")
        print(f" Long: {long_steps} steps ({long_pct:.1f}%)")
        print(f" Short: {short_steps} steps ({short_pct:.1f}%)")
        print(f" Neutral: {neutral_steps} steps")
        if short_pct > short_warn_pct:
            print(f" ⚠️ EXCESSIVE SHORTING - PENALTY APPLIED")
        print(f" 📈 EVAL (validation):")
        print(f" Reward: {eval_reward:.4f} | PnL: {eval_pnl:+.2f}%")
        print(f" Long%: {eval_long_pct:.1f}%")
        print(f" Avg (last 5): {np.mean(eval_rewards[-5:]):.4f}")
        print(f" 🧠 AGENT:")
        print(f" Alpha: {agent.alpha.item():.4f}")
        print(f" Q-value: {avg_q:.3f}")
        print(f" Critic loss: {avg_critic:.5f}")
        print(f" ⚡ Speed: {steps_per_sec:.0f} steps/sec")
        print(f" 💾 Buffer: {buffer.size:,} transitions")

        # Save best train
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save(f"{save_path}_best_train.pt")
            print(f" 🏆 NEW BEST TRAIN: {best_reward:.4f}")

        # Save best eval
        if eval_reward > best_eval:
            best_eval = eval_reward
            agent.save(f"{save_path}_best_eval.pt")
            print(f" 🏆 NEW BEST EVAL: {best_eval:.4f}")

        # Reset
        state = env.reset()
        episode_reward = 0
        episode_length = 0

    # Final save
    agent.save(f"{save_path}_final.pt")

    total_time = time.time() - start_time
    print(f"\n{'='*70}")
    print(f" TRAINING COMPLETE")
    print(f"{'='*70}")
    print(f" Total time: {total_time/60:.1f} min")
    print(f" Episodes: {episode_count}")
    print(f" Best train reward: {best_reward:.4f}")
    print(f" Best eval reward: {best_eval:.4f}")
    print(f" Avg speed: {total_timesteps/total_time:.0f} steps/sec")

    return episode_rewards, eval_rewards


def evaluate_agent(agent, env, n_episodes=1):
    """Run deterministic evaluation episodes.

    Returns:
        ``(mean_reward, mean_pnl_pct, mean_long_pct)`` averaged over
        ``n_episodes``; ``long_pct`` is the share of active (long or short)
        steps spent long.
    """
    # Use the environment's own starting balance for the PnL percentage.
    initial_balance = getattr(env, 'initial_balance', 10000)

    total_reward = 0
    total_pnl = 0
    total_long_pct = 0

    for _ in range(n_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.select_action(state, deterministic=True)
            state, reward, done, info = env.step(action)
            episode_reward += reward

        total_reward += episode_reward
        final_value = info.get('total_value', initial_balance)
        total_pnl += (final_value / initial_balance - 1) * 100

        # Calculate long percentage
        long_steps = info.get('long_steps', 0)
        short_steps = info.get('short_steps', 0)
        total_active = long_steps + short_steps
        total_long_pct += (long_steps / total_active * 100) if total_active > 0 else 0

    return total_reward / n_episodes, total_pnl / n_episodes, total_long_pct / n_episodes


print("✅ Training function ready (with per-episode eval + position tracking)")
print("="*70)

# %%
# ============================================================================
# CELL 9: START TRAINING
# ============================================================================
print("="*70)
print(" STARTING SAC TRAINING")
print("="*70)

# Training parameters
TOTAL_STEPS = 500_000   # 500K steps
WARMUP_STEPS = 10_000   # 10K random warmup
BATCH_SIZE = 256        # Standard batch size
UPDATE_FREQ = 1         # Update every step

print(f"\n📋 Configuration:")
print(f" Steps: {TOTAL_STEPS:,}")
print(f" Batch: {BATCH_SIZE}")
print(f" Train env: {len(train_data):,} candles")
print(f" Valid env: {len(valid_data):,} candles")
print(f" Device: {device}")

# Run training with validation eval every episode
episode_rewards, eval_rewards = train_sac(
    agent=agent,
    env=train_env,
    valid_env=valid_env,
    buffer=buffer,
    total_timesteps=TOTAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    batch_size=BATCH_SIZE,
    update_freq=UPDATE_FREQ,
    save_path="sac_v9_pytorch"
)

print("\n" + "="*70)
print(" TRAINING COMPLETE")
print("="*70)