!pip install -q ta
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

print("="*70)
print(" PYTORCH GPU SETUP (30GB GPU)")
print("="*70)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9

    print(f"✅ GPU: {gpu_name}")
    print(f"✅ GPU Memory: {gpu_mem:.1f} GB")

    # TF32 matmuls trade a little precision for large speedups on Ampere+.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("✅ TF32: Enabled (up to 2-3x speedup on Ampere)")

    # Let cuDNN benchmark and cache the fastest kernels for fixed shapes.
    torch.backends.cudnn.benchmark = True
    print("✅ cuDNN benchmark: Enabled")

    torch.set_default_device('cuda')
    print("✅ Default device: CUDA")
else:
    print("⚠️ No GPU detected, using CPU")

print(f"\n✅ PyTorch: {torch.__version__}")
print(f"✅ Device: {device}")
print("="*70)
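# Optional helper (an addition, not in the original notebook): a quick GPU
# memory report, handy for sizing the replay buffer and batch size later.
def gpu_mem_report():
    if torch.cuda.is_available():
        print(f"   allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB | "
              f"reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

gpu_mem_report()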
import numpy as np
import pandas as pd
import gym
from gym import spaces
from sklearn.preprocessing import StandardScaler
from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator, WilliamsRIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator, ADXIndicator, CCIIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import OnBalanceVolumeIndicator
import os

print("="*70)
print(" LOADING DATA + FEATURES")
print("="*70)

data_path = '/kaggle/input/bitcoin-historical-datasets-2018-2024/'
btc_data = pd.read_csv(data_path + 'btc_15m_data_2018_to_2025.csv')

column_mapping = {'Open time': 'timestamp', 'Open': 'open', 'High': 'high',
                  'Low': 'low', 'Close': 'close', 'Volume': 'volume'}
btc_data = btc_data.rename(columns=column_mapping)
btc_data['timestamp'] = pd.to_datetime(btc_data['timestamp'])
btc_data.set_index('timestamp', inplace=True)
btc_data = btc_data[['open', 'high', 'low', 'close', 'volume']]

for col in btc_data.columns:
    btc_data[col] = pd.to_numeric(btc_data[col], errors='coerce')

# Keep 2021+ data only, drop duplicate timestamps and zero/NaN rows.
btc_data = btc_data[btc_data.index >= '2021-01-01']
btc_data = btc_data[~btc_data.index.duplicated(keep='first')]
btc_data = btc_data.replace(0, np.nan).dropna().sort_index()

print(f"✅ BTC Data: {len(btc_data):,} candles")
# --- Fear & Greed Index (daily), joined onto the 15m index ---
fgi_loaded = False

try:
    fgi_path = '/kaggle/input/btc-usdt-4h-ohlc-fgi-daily-2020/'
    files = os.listdir(fgi_path)

    for filename in files:
        if filename.endswith('.csv'):
            fgi_data = pd.read_csv(fgi_path + filename)

            # Locate the timestamp column by name, falling back to column 0.
            time_col = [c for c in fgi_data.columns if 'time' in c.lower() or 'date' in c.lower()]
            if time_col:
                fgi_data['timestamp'] = pd.to_datetime(fgi_data[time_col[0]])
            else:
                fgi_data['timestamp'] = pd.to_datetime(fgi_data.iloc[:, 0])

            fgi_data.set_index('timestamp', inplace=True)

            # Locate the FGI value column by name.
            fgi_col = [c for c in fgi_data.columns if 'fgi' in c.lower() or 'fear' in c.lower() or 'greed' in c.lower()]
            if fgi_col:
                fgi_data = fgi_data[[fgi_col[0]]].rename(columns={fgi_col[0]: 'fgi'})
                fgi_loaded = True
                print(f"✅ Fear & Greed loaded: {len(fgi_data):,} values")
                break
except Exception:
    pass

if not fgi_loaded:
    fgi_data = pd.DataFrame(index=btc_data.index)
    fgi_data['fgi'] = 50  # neutral fallback
    print("⚠️ Using neutral FGI values")

btc_data = btc_data.join(fgi_data, how='left')
btc_data['fgi'] = btc_data['fgi'].ffill().bfill().fillna(50)
print("🔧 Calculating indicators...")
|
|
|
data = btc_data.copy()
|
|
|
|
|
|
|
|
|
data['rsi_14'] = RSIIndicator(close=data['close'], window=14).rsi() / 100
|
|
|
data['rsi_7'] = RSIIndicator(close=data['close'], window=7).rsi() / 100
|
|
|
|
|
|
stoch = StochasticOscillator(high=data['high'], low=data['low'], close=data['close'], window=14)
|
|
|
data['stoch_k'] = stoch.stoch() / 100
|
|
|
data['stoch_d'] = stoch.stoch_signal() / 100
|
|
|
|
|
|
roc = ROCIndicator(close=data['close'], window=12)
|
|
|
data['roc_12'] = np.tanh(roc.roc() / 100)
|
|
|
|
|
|
williams = WilliamsRIndicator(high=data['high'], low=data['low'], close=data['close'], lbp=14)
|
|
|
data['williams_r'] = (williams.williams_r() + 100) / 100
|
|
|
|
|
|
macd = MACD(close=data['close'])
|
|
|
data['macd'] = np.tanh(macd.macd() / data['close'] * 100)
|
|
|
data['macd_signal'] = np.tanh(macd.macd_signal() / data['close'] * 100)
|
|
|
data['macd_diff'] = np.tanh(macd.macd_diff() / data['close'] * 100)
|
|
|
|
|
|
|
|
|
data['sma_20'] = SMAIndicator(close=data['close'], window=20).sma_indicator()
|
|
|
data['sma_50'] = SMAIndicator(close=data['close'], window=50).sma_indicator()
|
|
|
data['ema_12'] = EMAIndicator(close=data['close'], window=12).ema_indicator()
|
|
|
data['ema_26'] = EMAIndicator(close=data['close'], window=26).ema_indicator()
|
|
|
|
|
|
data['price_vs_sma20'] = (data['close'] - data['sma_20']) / data['sma_20']
|
|
|
data['price_vs_sma50'] = (data['close'] - data['sma_50']) / data['sma_50']
|
|
|
|
|
|
adx = ADXIndicator(high=data['high'], low=data['low'], close=data['close'], window=14)
|
|
|
data['adx'] = adx.adx() / 100
|
|
|
data['adx_pos'] = adx.adx_pos() / 100
|
|
|
data['adx_neg'] = adx.adx_neg() / 100
|
|
|
|
|
|
cci = CCIIndicator(high=data['high'], low=data['low'], close=data['close'], window=20)
|
|
|
data['cci'] = np.tanh(cci.cci() / 100)
|
|
|
|
|
|
|
|
|
bb = BollingerBands(close=data['close'], window=20, window_dev=2)
|
|
|
data['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / bb.bollinger_mavg()
|
|
|
data['bb_position'] = (data['close'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())
|
|
|
|
|
|
atr = AverageTrueRange(high=data['high'], low=data['low'], close=data['close'], window=14)
|
|
|
data['atr_percent'] = atr.average_true_range() / data['close']
|
|
|
|
|
|
|
|
|
data['volume_ma_20'] = data['volume'].rolling(20).mean()
|
|
|
data['volume_ratio'] = data['volume'] / (data['volume_ma_20'] + 1e-8)
|
|
|
|
|
|
obv = OnBalanceVolumeIndicator(close=data['close'], volume=data['volume'])
|
|
|
data['obv_slope'] = (obv.on_balance_volume().diff(5) / (obv.on_balance_volume().shift(5).abs() + 1e-8))
|
|
|
|
|
|
|
|
|
data['returns_1'] = data['close'].pct_change()
|
|
|
data['returns_5'] = data['close'].pct_change(5)
|
|
|
data['returns_20'] = data['close'].pct_change(20)
|
|
|
data['volatility_20'] = data['returns_1'].rolling(20).std()
|
|
|
|
|
|
data['body_size'] = abs(data['close'] - data['open']) / (data['open'] + 1e-8)
|
|
|
data['high_20'] = data['high'].rolling(20).max()
|
|
|
data['low_20'] = data['low'].rolling(20).min()
|
|
|
data['price_position'] = (data['close'] - data['low_20']) / (data['high_20'] - data['low_20'] + 1e-8)
|
|
|
|
|
|
|
|
|
data['fgi_normalized'] = (data['fgi'] - 50) / 50
|
|
|
data['fgi_change'] = data['fgi'].diff() / 50
|
|
|
data['fgi_ma7'] = data['fgi'].rolling(7).mean()
|
|
|
data['fgi_vs_ma'] = (data['fgi'] - data['fgi_ma7']) / 50
|
|
|
|
|
|
|
|
|
data['hour'] = data.index.hour / 24
|
|
|
data['day_of_week'] = data.index.dayofweek / 7
|
|
|
data['us_session'] = ((data.index.hour >= 14) & (data.index.hour < 21)).astype(float)
|
|
|
|
|
|
btc_features = data.dropna()
|
|
|
feature_cols = [col for col in btc_features.columns if col not in ['open', 'high', 'low', 'close', 'volume']]
|
|
|
|
|
|
print(f"✅ Features: {len(feature_cols)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
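# Optional audit (an addition, not in the original notebook): confirm the
# engineered features are finite and roughly bounded before scaling; most
# are already mapped into [-1, 1]-ish ranges by the /100 and tanh steps.
assert not btc_features[feature_cols].isna().any().any(), "NaNs in features"
print(btc_features[feature_cols].agg(['min', 'max']).T.head(10))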
# Chronological 70/15/15 split (no shuffling, to avoid look-ahead).
train_size = int(len(btc_features) * 0.70)
valid_size = int(len(btc_features) * 0.15)

train_data = btc_features.iloc[:train_size].copy()
valid_data = btc_features.iloc[train_size:train_size + valid_size].copy()
test_data = btc_features.iloc[train_size + valid_size:].copy()

print(f"\n📊 Train: {len(train_data):,} | Valid: {len(valid_data):,} | Test: {len(test_data):,}")
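# Optional check (an addition): the split must be strictly chronological,
# so each range should end before the next begins.
print(f"   Train: {train_data.index.min()} → {train_data.index.max()}")
print(f"   Valid: {valid_data.index.min()} → {valid_data.index.max()}")
print(f"   Test:  {test_data.index.min()} → {test_data.index.max()}")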
class BitcoinTradingEnv(gym.Env):
    def __init__(self, df, initial_balance=10000, episode_length=500, transaction_fee=0.0,
                 long_bonus=0.0001, short_penalty_threshold=0.8, short_penalty=0.05):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.initial_balance = initial_balance
        self.episode_length = episode_length
        self.transaction_fee = transaction_fee

        # Anti-short-bias reward shaping parameters.
        self.long_bonus = long_bonus
        self.short_penalty_threshold = short_penalty_threshold
        self.short_penalty = short_penalty

        self.feature_cols = [col for col in df.columns
                             if col not in ['open', 'high', 'low', 'close', 'volume']]

        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(
            low=-10, high=10,
            shape=(len(self.feature_cols) + 5,),
            dtype=np.float32
        )
        self.reset()

    def reset(self):
        # Random episode start, keeping 100 steps of indicator history.
        max_start = len(self.df) - self.episode_length - 1
        self.start_idx = np.random.randint(100, max(101, max_start))

        self.current_step = 0
        self.balance = self.initial_balance
        self.position = 0.0
        self.entry_price = 0.0
        self.total_value = self.initial_balance
        self.prev_total_value = self.initial_balance
        self.max_value = self.initial_balance

        # Position-balance counters used by the anti-short penalty.
        self.long_steps = 0
        self.short_steps = 0
        self.neutral_steps = 0

        return self._get_obs()

    def _get_obs(self):
        idx = self.start_idx + self.current_step
        features = self.df.loc[idx, self.feature_cols].values

        total_return = (self.total_value / self.initial_balance) - 1
        drawdown = (self.max_value - self.total_value) / self.max_value if self.max_value > 0 else 0

        portfolio_info = np.array([
            self.position,
            total_return,
            drawdown,
            self.df.loc[idx, 'returns_1'],
            self.df.loc[idx, 'rsi_14']
        ], dtype=np.float32)

        obs = np.concatenate([features, portfolio_info])
        return np.clip(obs, -10, 10).astype(np.float32)

    def step(self, action):
        idx = self.start_idx + self.current_step
        current_price = self.df.loc[idx, 'close']
        target_position = np.clip(action[0], -1.0, 1.0)

        self.prev_total_value = self.total_value

        # Rebalance only on meaningful position changes (> 0.1).
        if abs(target_position - self.position) > 0.1:
            if self.position != 0:
                self._close_position(current_price)
            if abs(target_position) > 0.1:
                self._open_position(target_position, current_price)

        self._update_total_value(current_price)
        self.max_value = max(self.max_value, self.total_value)

        # Track position balance for logging and the terminal penalty.
        if self.position > 0.1:
            self.long_steps += 1
        elif self.position < -0.1:
            self.short_steps += 1
        else:
            self.neutral_steps += 1

        self.current_step += 1
        done = (self.current_step >= self.episode_length) or (self.total_value <= self.initial_balance * 0.5)

        # Base reward: step change in portfolio value, in units of initial balance.
        reward = (self.total_value - self.prev_total_value) / self.initial_balance

        # Small per-step bonus for holding longs.
        if self.position > 0.1:
            reward += self.long_bonus

        # Terminal penalty when shorts dominate the active steps.
        if done:
            total_active_steps = self.long_steps + self.short_steps
            if total_active_steps > 0:
                short_ratio = self.short_steps / total_active_steps
                if short_ratio > self.short_penalty_threshold:
                    reward -= self.short_penalty * (short_ratio - self.short_penalty_threshold) / (1 - self.short_penalty_threshold)

        obs = self._get_obs()
        info = {
            'total_value': self.total_value,
            'position': self.position,
            'long_steps': self.long_steps,
            'short_steps': self.short_steps,
            'neutral_steps': self.neutral_steps
        }

        return obs, reward, done, info

    def _update_total_value(self, current_price):
        if self.position != 0:
            if self.position > 0:
                pnl = self.position * self.initial_balance * (current_price / self.entry_price - 1)
            else:
                pnl = abs(self.position) * self.initial_balance * (1 - current_price / self.entry_price)
            self.total_value = self.balance + pnl
        else:
            self.total_value = self.balance

    def _open_position(self, size, price):
        self.position = size
        self.entry_price = price

    def _close_position(self, price):
        if self.position > 0:
            pnl = self.position * self.initial_balance * (price / self.entry_price - 1)
        else:
            pnl = abs(self.position) * self.initial_balance * (1 - price / self.entry_price)

        pnl -= abs(pnl) * self.transaction_fee
        self.balance += pnl
        self.position = 0.0

print("✅ Environment class ready (with anti-short bias)")
print("="*70)
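# Optional smoke test (an addition, not in the original notebook): step the
# environment with random actions to verify observation shapes and the info
# dict before training starts.
_env = BitcoinTradingEnv(train_data, episode_length=50)
_obs = _env.reset()
assert _obs.shape == _env.observation_space.shape
for _ in range(5):
    _obs, _r, _d, _info = _env.step(_env.action_space.sample())
print(f"   Smoke test OK: obs {_obs.shape}, value {_info['total_value']:.2f}")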
print("="*70)
|
|
|
print(" LOADING SENTIMENT DATA")
|
|
|
print("="*70)
|
|
|
|
|
|
sentiment_file = '/kaggle/input/bitcoin-news-with-sentimen/bitcoin_news_3hour_intervals_with_sentiment.csv'
|
|
|
|
|
|
try:
|
|
|
sentiment_raw = pd.read_csv(sentiment_file)
|
|
|
|
|
|
def parse_time_range(time_str):
|
|
|
parts = str(time_str).split(' ')
|
|
|
if len(parts) >= 2:
|
|
|
date = parts[0]
|
|
|
time_range = parts[1]
|
|
|
start_time = time_range.split('-')[0]
|
|
|
return f"{date} {start_time}:00"
|
|
|
return time_str
|
|
|
|
|
|
sentiment_raw['timestamp'] = sentiment_raw['time_interval'].apply(parse_time_range)
|
|
|
sentiment_raw['timestamp'] = pd.to_datetime(sentiment_raw['timestamp'])
|
|
|
sentiment_raw = sentiment_raw.set_index('timestamp').sort_index()
|
|
|
|
|
|
sentiment_clean = pd.DataFrame(index=sentiment_raw.index)
|
|
|
sentiment_clean['prob_bullish'] = pd.to_numeric(sentiment_raw['prob_bullish'], errors='coerce')
|
|
|
sentiment_clean['prob_bearish'] = pd.to_numeric(sentiment_raw['prob_bearish'], errors='coerce')
|
|
|
sentiment_clean['prob_neutral'] = pd.to_numeric(sentiment_raw['prob_neutral'], errors='coerce')
|
|
|
sentiment_clean['confidence'] = pd.to_numeric(sentiment_raw['sentiment_confidence'], errors='coerce')
|
|
|
sentiment_clean = sentiment_clean.dropna()
|
|
|
|
|
|
|
|
|
for df in [train_data, valid_data, test_data]:
|
|
|
df_temp = df.join(sentiment_clean, how='left')
|
|
|
for col in ['prob_bullish', 'prob_bearish', 'prob_neutral', 'confidence']:
|
|
|
df[col] = df_temp[col].fillna(method='ffill').fillna(method='bfill').fillna(0.33 if col != 'confidence' else 0.5)
|
|
|
|
|
|
df['sentiment_net'] = df['prob_bullish'] - df['prob_bearish']
|
|
|
df['sentiment_strength'] = (df['prob_bullish'] - df['prob_bearish']).abs()
|
|
|
df['sentiment_weighted'] = df['sentiment_net'] * df['confidence']
|
|
|
|
|
|
print(f"✅ Sentiment loaded: {len(sentiment_clean):,} records")
|
|
|
print(f"✅ Features added: 7 sentiment features")
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"⚠️ Sentiment not loaded: {e}")
|
|
|
for df in [train_data, valid_data, test_data]:
|
|
|
df['sentiment_net'] = 0
|
|
|
df['sentiment_strength'] = 0
|
|
|
df['sentiment_weighted'] = 0
|
|
|
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sklearn.preprocessing import StandardScaler

print("="*70)
print(" NORMALIZING DATA + CREATING ENVIRONMENTS")
print("="*70)

# Recompute feature columns (now including FGI + sentiment features).
feature_cols = [col for col in train_data.columns
                if col not in ['open', 'high', 'low', 'close', 'volume']]

print(f"📊 Total features: {len(feature_cols)}")

# Fit the scaler on train only to avoid leaking validation/test statistics.
scaler = StandardScaler()
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
valid_data[feature_cols] = scaler.transform(valid_data[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

# Clip outliers so observations stay in a bounded range.
for df in [train_data, valid_data, test_data]:
    df[feature_cols] = df[feature_cols].clip(-5, 5)

print("✅ Normalization complete (fitted on train only)")

train_env = BitcoinTradingEnv(train_data, episode_length=500)
valid_env = BitcoinTradingEnv(valid_data, episode_length=500)
test_env = BitcoinTradingEnv(test_data, episode_length=500)

state_dim = train_env.observation_space.shape[0]
action_dim = 1

print(f"\n✅ Environments created:")
print(f"   State dim: {state_dim}")
print(f"   Action dim: {action_dim}")
print(f"   Train episodes: ~{len(train_data)//500}")
print("="*70)
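# Optional check (an addition): after fitting on train only, the scaled
# train features should be near zero-mean with std at most 1 (post-clipping).
print(f"   max |mean|: {train_data[feature_cols].mean().abs().max():.3f}")
print(f"   max std:    {train_data[feature_cols].std().max():.3f}")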
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

print("="*70)
print(" PYTORCH SAC AGENT")
print("="*70)

class Actor(nn.Module):
    """Gaussian policy with tanh squashing."""
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)

        self.mean = nn.Linear(hidden_dim, action_dim)
        self.log_std = nn.Linear(hidden_dim, action_dim)

        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        mean = self.mean(x)
        log_std = self.log_std(x)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)

        return mean, log_std

    def sample(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        x_t = normal.rsample()  # reparameterized sample
        action = torch.tanh(x_t)

        # Tanh change-of-variables correction:
        # log p(a) = log p(u) - log(1 - tanh(u)^2), with a small epsilon.
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + 1e-6)
        log_prob = log_prob.sum(dim=-1, keepdim=True)

        return action, log_prob, mean

class Critic(nn.Module):
    """Twin Q-networks for clipped double-Q learning."""
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()

        # Q1
        self.fc1_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc1_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc1_3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc1_out = nn.Linear(hidden_dim, 1)

        # Q2
        self.fc2_1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_out = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)

        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        q1 = self.fc1_out(q1)

        q2 = F.relu(self.fc2_1(x))
        q2 = F.relu(self.fc2_2(q2))
        q2 = F.relu(self.fc2_3(q2))
        q2 = self.fc2_out(q2)

        return q1, q2

    def q1(self, state, action):
        x = torch.cat([state, action], dim=-1)
        q1 = F.relu(self.fc1_1(x))
        q1 = F.relu(self.fc1_2(q1))
        q1 = F.relu(self.fc1_3(q1))
        return self.fc1_out(q1)

class SACAgent:
    def __init__(self, state_dim, action_dim, device,
                 actor_lr=3e-4, critic_lr=3e-4, alpha_lr=3e-4,
                 gamma=0.99, tau=0.005, initial_alpha=0.2):

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.action_dim = action_dim

        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Automatic entropy tuning toward the target entropy -|A|.
        self.target_entropy = -action_dim
        self.log_alpha = torch.tensor(np.log(initial_alpha), requires_grad=True, device=device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    def select_action(self, state, deterministic=False):
        with torch.no_grad():
            # torch.as_tensor instead of the legacy torch.FloatTensor
            # constructor, which can fail once the default device is CUDA.
            state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            if deterministic:
                mean, _ = self.actor(state)
                action = torch.tanh(mean)
            else:
                action, _, _ = self.actor.sample(state)
            return action.cpu().numpy()[0]

    def update(self, batch):
        states, actions, rewards, next_states, dones = batch

        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # --- Critic update: soft Bellman target with clipped double-Q ---
        with torch.no_grad():
            next_actions, next_log_probs, _ = self.actor.sample(next_states)
            q1_target, q2_target = self.critic_target(next_states, next_actions)
            q_target = torch.min(q1_target, q2_target)
            target_q = rewards + (1 - dones) * self.gamma * (q_target - self.alpha * next_log_probs)

        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic_optimizer.step()

        # --- Actor update ---
        new_actions, log_probs, _ = self.actor.sample(states)
        q1_new, q2_new = self.critic(states, new_actions)
        q_new = torch.min(q1_new, q2_new)

        actor_loss = (self.alpha.detach() * log_probs - q_new).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
        self.actor_optimizer.step()

        # --- Temperature (alpha) update ---
        alpha_loss = -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # --- Soft (Polyak) target update ---
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        return {
            'critic_loss': critic_loss.item(),
            'actor_loss': actor_loss.item(),
            'alpha': self.alpha.item(),
            'q_value': q1.mean().item()
        }

    def save(self, path):
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'log_alpha': self.log_alpha,
        }, path)

    def load(self, path):
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        self.log_alpha = checkpoint['log_alpha']

print("✅ SACAgent class defined (PyTorch)")
print("="*70)
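# Optional sanity check (an addition, not in the original notebook):
# Actor.sample returns tanh-squashed actions in (-1, 1) with per-sample
# log-probs of matching shape.
_actor = Actor(state_dim=8, action_dim=1).to(device)
_s = torch.randn(2, 8, device=device)
_a, _logp, _mu = _actor.sample(_s)
print(_a.shape, _logp.shape, bool(_a.abs().max() < 1.0))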
print("="*70)
|
|
|
print(" REPLAY BUFFER")
|
|
|
print("="*70)
|
|
|
|
|
|
class ReplayBuffer:
|
|
|
def __init__(self, state_dim, action_dim, max_size=1_000_000):
|
|
|
self.max_size = max_size
|
|
|
self.ptr = 0
|
|
|
self.size = 0
|
|
|
|
|
|
self.states = np.zeros((max_size, state_dim), dtype=np.float32)
|
|
|
self.actions = np.zeros((max_size, action_dim), dtype=np.float32)
|
|
|
self.rewards = np.zeros((max_size, 1), dtype=np.float32)
|
|
|
self.next_states = np.zeros((max_size, state_dim), dtype=np.float32)
|
|
|
self.dones = np.zeros((max_size, 1), dtype=np.float32)
|
|
|
|
|
|
mem_gb = (self.states.nbytes + self.actions.nbytes + self.rewards.nbytes +
|
|
|
self.next_states.nbytes + self.dones.nbytes) / 1e9
|
|
|
print(f"📦 Buffer capacity: {max_size:,} | Memory: {mem_gb:.2f} GB")
|
|
|
|
|
|
def add(self, state, action, reward, next_state, done):
|
|
|
self.states[self.ptr] = state
|
|
|
self.actions[self.ptr] = action
|
|
|
self.rewards[self.ptr] = reward
|
|
|
self.next_states[self.ptr] = next_state
|
|
|
self.dones[self.ptr] = done
|
|
|
|
|
|
self.ptr = (self.ptr + 1) % self.max_size
|
|
|
self.size = min(self.size + 1, self.max_size)
|
|
|
|
|
|
def sample(self, batch_size):
|
|
|
idx = np.random.randint(0, self.size, size=batch_size)
|
|
|
return (
|
|
|
self.states[idx],
|
|
|
self.actions[idx],
|
|
|
self.rewards[idx],
|
|
|
self.next_states[idx],
|
|
|
self.dones[idx]
|
|
|
)
|
|
|
|
|
|
print("✅ ReplayBuffer defined")
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
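# Optional usage sketch (an addition): one add/sample round-trip on a tiny
# buffer; sampled arrays keep the (batch, dim) shapes agent.update expects.
_buf = ReplayBuffer(state_dim=3, action_dim=1, max_size=100)
_buf.add(np.ones(3), np.array([0.5]), 1.0, np.zeros(3), 0.0)
_s, _a, _r, _ns, _d = _buf.sample(1)
print(_s.shape, _a.shape, _r.shape)  # (1, 3) (1, 1) (1, 1)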
print("="*70)
|
|
|
print(" CREATING AGENT + BUFFER")
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
agent = SACAgent(
|
|
|
state_dim=state_dim,
|
|
|
action_dim=action_dim,
|
|
|
device=device,
|
|
|
actor_lr=3e-4,
|
|
|
critic_lr=3e-4,
|
|
|
alpha_lr=3e-4,
|
|
|
gamma=0.99,
|
|
|
tau=0.005,
|
|
|
initial_alpha=0.2
|
|
|
)
|
|
|
|
|
|
|
|
|
buffer = ReplayBuffer(
|
|
|
state_dim=state_dim,
|
|
|
action_dim=action_dim,
|
|
|
max_size=1_000_000
|
|
|
)
|
|
|
|
|
|
|
|
|
total_params = sum(p.numel() for p in agent.actor.parameters()) + \
|
|
|
sum(p.numel() for p in agent.critic.parameters())
|
|
|
|
|
|
print(f"\n✅ Agent created on {device}")
|
|
|
print(f" Actor params: {sum(p.numel() for p in agent.actor.parameters()):,}")
|
|
|
print(f" Critic params: {sum(p.numel() for p in agent.critic.parameters()):,}")
|
|
|
print(f" Total params: {total_params:,}")
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
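# Optional sanity check (an addition): one action from a zero state; the
# result is a length-1 array with a value inside the tanh bounds (-1, 1).
_a = agent.select_action(np.zeros(state_dim, dtype=np.float32))
print(_a.shape, float(_a[0]))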
from tqdm.notebook import tqdm
import time

print("="*70)
print(" TRAINING FUNCTION")
print("="*70)

def train_sac(agent, env, valid_env, buffer,
              total_timesteps=700_000,
              warmup_steps=10_000,
              batch_size=1024,
              update_freq=1,
              save_path="sac_v9"):

    print(f"\n🚀 Training Configuration:")
    print(f"   Total steps: {total_timesteps:,}")
    print(f"   Warmup: {warmup_steps:,}")
    print(f"   Batch size: {batch_size}")
    print(f"   Device: {agent.device}")

    episode_rewards = []
    episode_lengths = []
    eval_rewards = []
    best_reward = -np.inf
    best_eval = -np.inf

    critic_losses = []
    actor_losses = []
    q_values = []

    state = env.reset()
    episode_reward = 0
    episode_length = 0
    episode_count = 0

    start_time = time.time()

    pbar = tqdm(range(total_timesteps), desc="Training")

    for step in pbar:
        # Random actions during warmup, then the stochastic policy.
        if step < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state, deterministic=False)

        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, reward, next_state, float(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        # One gradient update per environment step after warmup.
        if step >= warmup_steps and step % update_freq == 0:
            batch = buffer.sample(batch_size)
            stats = agent.update(batch)
            critic_losses.append(stats['critic_loss'])
            actor_losses.append(stats['actor_loss'])
            q_values.append(stats['q_value'])

        if done:
            episode_rewards.append(episode_reward)
            episode_lengths.append(episode_length)
            episode_count += 1

            final_value = info.get('total_value', env.initial_balance)
            pnl_pct = (final_value / env.initial_balance - 1) * 100

            # Position balance for this episode.
            long_steps = info.get('long_steps', 0)
            short_steps = info.get('short_steps', 0)
            neutral_steps = info.get('neutral_steps', 0)
            total_active = long_steps + short_steps
            long_pct = (long_steps / total_active * 100) if total_active > 0 else 0
            short_pct = (short_steps / total_active * 100) if total_active > 0 else 0

            avg_reward = np.mean(episode_rewards[-10:]) if len(episode_rewards) >= 10 else episode_reward
            avg_q = np.mean(q_values[-100:]) if q_values else 0
            avg_critic = np.mean(critic_losses[-100:]) if critic_losses else 0

            pbar.set_postfix({
                'ep': episode_count,
                'R': f'{episode_reward:.4f}',
                'avg10': f'{avg_reward:.4f}',
                'PnL%': f'{pnl_pct:+.2f}',
                'L/S': f'{long_pct:.0f}/{short_pct:.0f}',
                'α': f'{agent.alpha.item():.3f}',
            })

            # Deterministic evaluation on the validation env after every episode.
            eval_reward, eval_pnl, eval_long_pct = evaluate_agent(agent, valid_env, n_episodes=1)
            eval_rewards.append(eval_reward)

            elapsed = time.time() - start_time
            steps_per_sec = (step + 1) / elapsed

            print(f"\n{'='*60}")
            print(f"📊 Episode {episode_count} Complete | Step {step+1:,}/{total_timesteps:,}")
            print(f"{'='*60}")
            print(f"  🎮 TRAIN:")
            print(f"     Reward: {episode_reward:.4f} | PnL: {pnl_pct:+.2f}%")
            print(f"     Length: {episode_length} steps")
            print(f"     Avg (last 10): {avg_reward:.4f}")
            print(f"  📊 POSITION BALANCE:")
            print(f"     Long: {long_steps} steps ({long_pct:.1f}%)")
            print(f"     Short: {short_steps} steps ({short_pct:.1f}%)")
            print(f"     Neutral: {neutral_steps} steps")
            if short_pct > 80:
                print(f"     ⚠️ EXCESSIVE SHORTING - PENALTY APPLIED")
            print(f"  📈 EVAL (validation):")
            print(f"     Reward: {eval_reward:.4f} | PnL: {eval_pnl:+.2f}%")
            print(f"     Long%: {eval_long_pct:.1f}%")
            print(f"     Avg (last 5): {np.mean(eval_rewards[-5:]):.4f}")
            print(f"  🧠 AGENT:")
            print(f"     Alpha: {agent.alpha.item():.4f}")
            print(f"     Q-value: {avg_q:.3f}")
            print(f"     Critic loss: {avg_critic:.5f}")
            print(f"  ⚡ Speed: {steps_per_sec:.0f} steps/sec")
            print(f"  💾 Buffer: {buffer.size:,} transitions")

            # Checkpoint on new best train / eval rewards.
            if episode_reward > best_reward:
                best_reward = episode_reward
                agent.save(f"{save_path}_best_train.pt")
                print(f"  🏆 NEW BEST TRAIN: {best_reward:.4f}")

            if eval_reward > best_eval:
                best_eval = eval_reward
                agent.save(f"{save_path}_best_eval.pt")
                print(f"  🏆 NEW BEST EVAL: {best_eval:.4f}")

            state = env.reset()
            episode_reward = 0
            episode_length = 0

    agent.save(f"{save_path}_final.pt")

    total_time = time.time() - start_time
    print(f"\n{'='*70}")
    print(f" TRAINING COMPLETE")
    print(f"{'='*70}")
    print(f"   Total time: {total_time/60:.1f} min")
    print(f"   Episodes: {episode_count}")
    print(f"   Best train reward: {best_reward:.4f}")
    print(f"   Best eval reward: {best_eval:.4f}")
    print(f"   Avg speed: {total_timesteps/total_time:.0f} steps/sec")

    return episode_rewards, eval_rewards


def evaluate_agent(agent, env, n_episodes=1):
    """Run deterministic evaluation episodes; returns mean reward, PnL%, long%."""
    total_reward = 0
    total_pnl = 0
    total_long_pct = 0

    for _ in range(n_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            action = agent.select_action(state, deterministic=True)
            state, reward, done, info = env.step(action)
            episode_reward += reward

        total_reward += episode_reward
        final_value = info.get('total_value', env.initial_balance)
        total_pnl += (final_value / env.initial_balance - 1) * 100

        long_steps = info.get('long_steps', 0)
        short_steps = info.get('short_steps', 0)
        total_active = long_steps + short_steps
        total_long_pct += (long_steps / total_active * 100) if total_active > 0 else 0

    return total_reward / n_episodes, total_pnl / n_episodes, total_long_pct / n_episodes


print("✅ Training function ready (with per-episode eval + position tracking)")
print("="*70)
print("="*70)
|
|
|
print(" STARTING SAC TRAINING")
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
TOTAL_STEPS = 500_000
|
|
|
WARMUP_STEPS = 10_000
|
|
|
BATCH_SIZE = 256
|
|
|
UPDATE_FREQ = 1
|
|
|
|
|
|
print(f"\n📋 Configuration:")
|
|
|
print(f" Steps: {TOTAL_STEPS:,}")
|
|
|
print(f" Batch: {BATCH_SIZE}")
|
|
|
print(f" Train env: {len(train_data):,} candles")
|
|
|
print(f" Valid env: {len(valid_data):,} candles")
|
|
|
print(f" Device: {device}")
|
|
|
|
|
|
|
|
|
episode_rewards, eval_rewards = train_sac(
|
|
|
agent=agent,
|
|
|
env=train_env,
|
|
|
valid_env=valid_env,
|
|
|
buffer=buffer,
|
|
|
total_timesteps=TOTAL_STEPS,
|
|
|
warmup_steps=WARMUP_STEPS,
|
|
|
batch_size=BATCH_SIZE,
|
|
|
update_freq=UPDATE_FREQ,
|
|
|
save_path="sac_v9_pytorch"
|
|
|
)
|
|
|
|
|
|
print("\n" + "="*70)
|
|
|
print(" TRAINING COMPLETE")
|
|
|
print("="*70)
|
|
|
|
|
|
|
|
|
|