import logging
import math

import numpy as np
import pandas as pd

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader

    HAS_TORCH = True
except ImportError:
    # Keep the names bound so the module still imports without PyTorch.
    torch = None
    nn = None
    Dataset = None
    DataLoader = None
    HAS_TORCH = False


class _NumpyStandardScaler:
    """Minimal stand-in for a standard scaler, used when scikit-learn is absent.

    Implements only ``fit`` and ``transform``: per-column mean removal and
    division by the population standard deviation. NaNs are ignored while
    fitting and zero-variance columns are left unscaled.
    """

    def fit(self, X):
        data = np.asarray(X, dtype=float)
        self.mean_ = np.nanmean(data, axis=0)
        scale = np.nanstd(data, axis=0)
        self.scale_ = np.where(scale == 0.0, 1.0, scale)
        return self

    def transform(self, X):
        return (np.asarray(X, dtype=float) - self.mean_) / self.scale_


try:
    from sklearn.preprocessing import StandardScaler
except ImportError:
    # A missing scikit-learn should not disable the torch code path.
    StandardScaler = _NumpyStandardScaler

logger = logging.getLogger("portfolio_engine")

# Base classes are resolved once so the class statements below do not raise
# NameError at import time when PyTorch is not installed.
_ModuleBase = nn.Module if HAS_TORCH else object
_DatasetBase = Dataset if HAS_TORCH else object


def _require_torch():
    """Raise ImportError when a torch-backed component is used without PyTorch."""
    if not HAS_TORCH:
        raise ImportError("PyTorch is required to use the transformer components.")


class PositionalEncoding(_ModuleBase):
    """Add fixed sinusoidal position encodings to a sequence-first tensor."""

    def __init__(self, d_model: int, max_len: int = 5000):
        """
        Args:
            d_model: Embedding dimension of the inputs.
            max_len: Longest sequence length the precomputed table supports.
        """
        _require_torch()
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine slot than sine slot, so
        # trim the frequencies to match (a no-op when d_model is even).
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: "torch.Tensor") -> "torch.Tensor":
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            ``x`` with the first ``seq_len`` rows of the encoding table added.
        """
        return x + self.pe[:x.size(0)]


class NoiseFilteredTransformer(_ModuleBase):
    """Conv1d smoothing front-end, Transformer encoder, and scalar regression head."""

    def __init__(self, num_features, d_model=64, nhead=4, num_layers=2, dropout=0.1):
        """
        Args:
            num_features: Number of input channels per time step.
            d_model: Width of the convolution output and of the encoder.
            nhead: Number of attention heads in each encoder layer.
            num_layers: Number of stacked encoder layers.
            dropout: Dropout rate used in the encoder layers and output head.
        """
        _require_torch()
        super().__init__()
        self.d_model = d_model

        # 1. Noise filter (1D convolution).
        # Inputs arrive as (batch, seq_len, num_features) while Conv1d expects
        # (batch, channels, length), so forward() transposes around it.
        self.noise_filter = nn.Conv1d(
            in_channels=num_features,
            out_channels=d_model,
            kernel_size=3,
            padding=1,
        )
        self.filter_activation = nn.GELU()

        # 2. Positional encoding.
        self.pos_encoder = PositionalEncoding(d_model)

        # 3. Transformer encoder.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)

        # 4. Output head.
        self.fc_out = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, 1),
        )

    def forward(self, src):
        """Map ``src`` of shape (batch, seq_len, num_features) to (batch,) predictions."""
        # Conv1d works on (batch, features, seq_len).
        x = src.transpose(1, 2)
        x = self.noise_filter(x)
        x = self.filter_activation(x)
        # Back to (batch, seq_len, d_model).
        x = x.transpose(1, 2)

        # The encoder is batch-first, but PositionalEncoding is sequence-first.
        x = x.transpose(0, 1)
        x = self.pos_encoder(x)
        x = x.transpose(0, 1)

        x = self.transformer_encoder(x)

        # Global average pooling over the sequence dimension.
        x = x.mean(dim=1)

        out = self.fc_out(x)
        return out.squeeze(-1)


class CrossAssetSequenceDataset(_DatasetBase):
    """Sliding-window samples pooled over every asset frame in ``features_dict``.

    Each sample is ``seq_len`` consecutive feature rows paired with the
    ``target`` value of the row immediately after the window.
    """

    def __init__(self, features_dict, seq_len=60, scaler=None, is_train=True):
        """
        Args:
            features_dict: Mapping of asset key to DataFrame. The ``target``
                and ``ret`` columns (when present) are excluded from features.
            seq_len: Number of rows per window.
            scaler: Pre-fitted scaler to reuse; when ``None`` and ``is_train``
                is true a new ``StandardScaler`` is fitted on all eligible rows.
            is_train: When true, windows whose target is NaN are dropped.
        """
        _require_torch()
        self.seq_len = seq_len
        self.samples = []
        self.targets = []

        # First pass: gather the frames long enough to yield at least one
        # window, so the scaler can be fitted on all of them together.
        eligible = []
        for df in features_dict.values():
            if df.empty or len(df) <= seq_len:
                continue
            feats = df.drop(columns=['target', 'ret'], errors='ignore').values
            if 'target' in df.columns:
                targets = df['target'].values
            else:
                targets = np.zeros(len(df))
            eligible.append((feats, targets))

        if not eligible:
            self.scaler = None
            return

        if scaler is None and is_train:
            self.scaler = StandardScaler()
            self.scaler.fit(np.vstack([feats for feats, _ in eligible]))
        else:
            self.scaler = scaler

        # Second pass: build the overlapping windows.
        for feats, targets in eligible:
            if self.scaler is not None:
                feats = self.scaler.transform(feats)
            for start in range(len(feats) - seq_len):
                stop = start + seq_len
                # NOTE(review): the label is taken from row `stop`, i.e. one
                # row past the last feature row of the window - confirm this
                # matches how `target` is aligned upstream.
                target = targets[stop]
                # Only keep non-NaN targets for training.
                if is_train and np.isnan(target):
                    continue
                self.samples.append(feats[start:stop])
                self.targets.append(target)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x = torch.tensor(self.samples[idx], dtype=torch.float32)
        y = torch.tensor(self.targets[idx], dtype=torch.float32)
        return x, y


def train_cross_asset_transformer(features_dict, seq_len=60, epochs=10, batch_size=256, device=None, silent=False):
    """
    Trains a global Cross-Asset Transformer sequence model.

    Each asset frame longer than ``seq_len + 21`` rows is split 80/20 by row
    order into training and validation parts. The weights with the lowest
    validation loss are restored before returning.

    Args:
        features_dict: Mapping of asset key to feature DataFrame.
        seq_len: Window length fed to the model.
        epochs: Number of passes over the training windows.
        batch_size: Mini-batch size for both loaders.
        device: Torch device; defaults to CUDA when available, else CPU.
        silent: Suppress the warning emitted when PyTorch is unavailable.

    Returns:
        ``(model, scaler)``, or ``(None, None)`` when PyTorch is missing or no
        training windows could be built.
    """
    if not HAS_TORCH:
        if not silent:
            logger.warning("PyTorch not installed. Cannot train Transformer.")
        return None, None

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Split into train/val (80/20 by time for each asset).
    train_dict = {}
    val_dict = {}
    for t, df in features_dict.items():
        if len(df) > seq_len + 21:
            split_idx = int(len(df) * 0.8)
            train_dict[t] = df.iloc[:split_idx]
            val_dict[t] = df.iloc[split_idx:]

    train_dataset = CrossAssetSequenceDataset(train_dict, seq_len=seq_len, is_train=True)
    if len(train_dataset) == 0:
        return None, None

    val_dataset = CrossAssetSequenceDataset(
        val_dict, seq_len=seq_len, scaler=train_dataset.scaler, is_train=False
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    num_features = train_dataset.samples[0].shape[1]
    model = NoiseFilteredTransformer(
        num_features=num_features,
        d_model=64,
        nhead=4,
        num_layers=2,
        dropout=0.1,
    ).to(device)

    criterion = nn.MSELoss()
    # AdamW with weight decay for regularization.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item() * len(X_batch)
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        val_count = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                # The validation dataset keeps NaN targets; a single one would
                # turn the epoch loss into NaN and disable both best-state
                # tracking and the LR scheduler, so score finite targets only.
                finite = torch.isfinite(y_batch)
                n_finite = int(finite.sum().item())
                if n_finite == 0:
                    continue
                preds = model(X_batch)
                loss = criterion(preds[finite], y_batch[finite])
                val_loss += loss.item() * n_finite
                val_count += n_finite

        if val_count > 0:
            val_loss /= val_count
        else:
            # No usable validation targets: fall back to the training loss.
            val_loss = train_loss

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # clone() is required: on CPU, .cpu() returns the very same tensor,
            # so without a copy the snapshot would keep following later updates.
            best_state = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, train_dataset.scaler


def predict_transformer(model, scaler, features_dict, seq_len=60, device=None):
    """
    Infers the latest prediction for each asset using the trained Transformer.

    Args:
        model: Trained model, or ``None``.
        scaler: Fitted scaler applied to the features, or ``None`` to skip scaling.
        features_dict: Mapping of asset key to feature DataFrame.
        seq_len: Number of trailing rows fed to the model.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        Dict mapping asset key to a float prediction. Assets with fewer than
        ``seq_len`` rows are omitted; the dict is empty when PyTorch or the
        model is unavailable.
    """
    if not HAS_TORCH or model is None:
        return {}

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    model.eval()

    predictions = {}
    with torch.no_grad():
        for t, df in features_dict.items():
            if len(df) < seq_len:
                continue

            # Latest `seq_len` rows of features.
            recent_feats = df.drop(columns=['target', 'ret'], errors='ignore').values[-seq_len:]
            if scaler is not None:
                recent_feats = scaler.transform(recent_feats)

            # Shape (1, seq_len, features).
            X_tensor = torch.tensor(recent_feats, dtype=torch.float32).unsqueeze(0).to(device)
            predictions[t] = model(X_tensor).item()

    return predictions