"""
End-to-End Portfolio Forecast Model
=====================================
A neural network that learns to predict expected returns optimised for
*portfolio construction quality*, not forecast accuracy.

Architecture
------------
    Raw Features
        → Per-Asset Encoder (shared MLP)
        → Cross-Sectional Multi-Head Attention
        → μ_predicted + vol_scale
        → DifferentiablePortfolioLayer → w*
        → Loss = −Sharpe(w*, realised)  |  SPO+ regret

The cross-asset attention is the critical structural choice: it lets AAPL's
features affect TLT's predicted return because the downstream optimizer
cares about covariance, not just individual forecasts.

Papers
------
- Amos & Kolter (2017), OptNet.
- Agrawal et al. (2019), cvxpylayers.
- Elmachtoub & Grigas (2022), Smart Predict-then-Optimize (SPO+).
"""

import os
import hashlib
import pickle

import numpy as np
import pandas as pd

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
    _TORCH_AVAILABLE = True
except ImportError:
    # Lightweight stand-ins so that this module can still be imported (class
    # statements and annotations evaluate) when PyTorch is not installed.
    _TORCH_AVAILABLE = False

    class _MockTorch:
        Tensor = object
    torch = _MockTorch()

    class _MockNN:
        Module = object
    nn = _MockNN()
    DataLoader = object
    TensorDataset = object

from typing import Tuple, Dict, Optional

from config import logger, Color

if _TORCH_AVAILABLE:
    from differentiable_optimizer import DifferentiablePortfolioLayer
else:
    DifferentiablePortfolioLayer = None


# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 1. NEURAL ARCHITECTURE
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class PortfolioForecastNetwork(nn.Module):
    """
    Predicts per-asset expected returns optimised for decision quality.

    The key insight: the loss function is portfolio Sharpe ratio (or a
    decision-regret loss), *not* MSE.  Gradients flow backward through the
    differentiable optimisation layer into the network weights.

    Parameters
    ----------
    n_assets : number of assets in the cross-section (stored, not otherwise
        used by the layers, which are shared across assets).
    n_features : number of input features per asset.
    hidden_dim : width of the encoder.  The attention block runs on
        ``hidden_dim // 2`` channels with 4 heads, so ``hidden_dim // 2``
        must be divisible by 4.
    """

    def __init__(self, n_assets: int, n_features: int, hidden_dim: int = 64):
        super().__init__()
        self.n_assets = n_assets

        # ── Per-asset temporal encoder (shared weights) ──
        self.asset_encoder = nn.Sequential(
            nn.Linear(n_features, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),
        )

        # ── Cross-sectional attention ──
        # Assets "talk" to each other so that predicting high AAPL return
        # when MSFT is also high leads to diversification-aware weights.
        self.cross_asset_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim // 2,
            num_heads=4,
            dropout=0.1,
            batch_first=True,
        )

        # ── Output heads ──
        self.return_head = nn.Linear(hidden_dim // 2, 1)

        # Diagonal volatility scaling (Softplus → positive)
        self.vol_adjustment = nn.Sequential(
            nn.Linear(hidden_dim // 2, 1),
            nn.Softplus(),
        )

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args
        ----
        x : (batch, n_assets, n_features)

        Returns
        -------
        mu        : (batch, n_assets) predicted expected returns
        vol_scale : (batch, n_assets) predicted volatility scaling factors
        """
        # Encode each asset independently (shared weights)
        encoded = self.asset_encoder(x)                         # (B, N, H/2)

        # Cross-asset attention + residual
        attended, _ = self.cross_asset_attention(encoded, encoded, encoded)
        attended = attended + encoded                           # (B, N, H/2)

        mu = self.return_head(attended).squeeze(-1)             # (B, N)
        vol_scale = self.vol_adjustment(attended).squeeze(-1)   # (B, N)
        return mu, vol_scale


# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 2. TRAINER
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class E2EPortfolioTrainer:
    """
    Trains ``PortfolioForecastNetwork`` using portfolio performance as the
    loss function, back-propagating through the differentiable optimisation
    layer.

    Loss functions
    --------------
    sharpe : maximise Sharpe ratio
    spo    : decision regret versus hindsight-optimal weights, in the spirit
             of Smart Predict-then-Optimize (Elmachtoub & Grigas 2022)

    Any ``loss_type`` other than ``"spo"`` falls back to the Sharpe loss.
    """

    def __init__(self, n_assets: int, n_features: int,
                 risk_factor: float = 3.0,
                 loss_type: str = "spo",
                 hidden_dim: int = 64,
                 lr: float = 1e-3,
                 device: str = "cpu"):
        if not _TORCH_AVAILABLE:
            # Without this guard the mock ``torch`` object fails further down
            # with an opaque AttributeError.
            raise ImportError(
                "E2EPortfolioTrainer requires PyTorch, which is not installed."
            )

        self.device = torch.device(device)
        self.loss_type = loss_type
        self.n_assets = n_assets
        self.n_features = n_features

        self.forecast_net = PortfolioForecastNetwork(
            n_assets, n_features, hidden_dim,
        ).to(self.device)

        self.opt_layer = DifferentiablePortfolioLayer(
            n_assets, risk_factor,
        )

        self.optimizer = torch.optim.AdamW(
            self.forecast_net.parameters(), lr=lr, weight_decay=1e-4,
        )
        # NOTE: the cosine period is fixed at 100 scheduler steps (one step
        # per epoch in ``train``), independent of ``n_epochs``.
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, T_max=100,
        )

    # ------------------------------------------------------------------ #
    #  Loss functions                                                      #
    # ------------------------------------------------------------------ #
    def _portfolio_sharpe_loss(
        self,
        weights: torch.Tensor,
        realized_returns: torch.Tensor,
        rfr: float = 0.04,
        trading_days: int = 252,
    ) -> torch.Tensor:
        """
        Negative Sharpe ratio as training loss.

        weights          : (batch, n_assets)
        realized_returns : (batch, horizon, n_assets)

        With ``horizon >= 2`` a Sharpe ratio is computed per sample along the
        horizon axis and averaged over the batch.  With ``horizon == 1``
        (what ``build_feature_tensors`` produces) a per-sample standard
        deviation is undefined (NaN), so the samples of the batch are treated
        as the return series instead.  If fewer than two observations exist
        in total, the negative mean excess return is returned.
        """
        # Portfolio return at each step in the horizon (B, H)
        port_rets = (realized_returns * weights.unsqueeze(1)).sum(dim=-1)
        excess = port_rets - rfr / trading_days

        if excess.shape[1] >= 2:
            mean_excess = excess.mean(dim=1)
            std_excess = excess.std(dim=1) + 1e-8
            return -(mean_excess / std_excess).mean()

        # Single-step horizon: dispersion must come from across samples.
        flat = excess.reshape(-1)
        if flat.numel() < 2:
            return -flat.mean()
        return -(flat.mean() / (flat.std() + 1e-8))

    def _spo_loss(
        self,
        predicted_mu: torch.Tensor,
        realized_returns: torch.Tensor,
        L: torch.Tensor,
        predicted_weights: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Decision-regret loss inspired by SPO (Elmachtoub & Grigas 2022).

        Penalises the gap between the realised portfolio value achieved by
        predicted weights vs. the oracle weights (known only in hindsight).
        Note that this is the plain regret differentiated through the
        optimisation layer, not the convex SPO+ surrogate from the paper.

        predicted_mu      : (batch, n_assets)
        realized_returns  : (batch, horizon, n_assets)
        L                 : (batch, n_assets, n_assets) Cholesky factor
        predicted_weights : optional weights already solved for
            ``(predicted_mu, L)``; pass them to avoid a second solve.
        """
        # Weights from predicted returns — routed through forward() so solver
        # args stay consistent with any future changes to the opt layer.
        if predicted_weights is None:
            predicted_weights = self.opt_layer(predicted_mu, L)

        # Oracle weights — from realised returns (available only in training)
        realized_mean = realized_returns.mean(dim=1).detach()
        with torch.no_grad():
            oracle_weights, = self.opt_layer.layer(
                realized_mean, L,
                solver_args={"solve_method": "SCS", "eps": 1e-4,
                             "max_iters": 5000},
            )

        # Decision regret
        oracle_value = (oracle_weights * realized_mean).sum(dim=-1)
        predicted_value = (predicted_weights * realized_mean).sum(dim=-1)
        regret = oracle_value - predicted_value
        return regret.mean()

    # ------------------------------------------------------------------ #
    #  Data preparation                                                    #
    # ------------------------------------------------------------------ #
    def build_feature_tensors(
        self,
        features_dict: Dict[str, pd.DataFrame],
        returns_df: pd.DataFrame,
        horizon: int = 21,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, list, pd.DatetimeIndex]:
        """
        Converts the existing ``features_dict`` from
        ``data.build_ml_features()`` into aligned tensors for batch training.

        Parameters
        ----------
        features_dict : ticker → feature frame.  Every column except
            ``"target"`` and ``"ret"`` of the first ticker's frame is used as
            a feature; ``"target"`` (0.0 if absent) is the label.
        returns_df : per-ticker return series used for the EWMA covariance.
        horizon : currently unused — targets are single-step, so the horizon
            axis of ``y`` always has length 1.

        Returns
        -------
        X          : (T, n_assets, n_features)
        y          : (T, 1, n_assets)   realised forward returns
        cov_L      : (T, n_assets, n_assets)   Cholesky of rolling EWMA
                     covariance (identity where it cannot be computed)
        tickers    : list
        common_idx : DatetimeIndex

        Raises
        ------
        ValueError : if ``features_dict`` is empty.
        """
        tickers = list(features_dict.keys())
        if not tickers:
            raise ValueError("E2E: features_dict is empty — no assets given.")
        n_assets = len(tickers)

        # Align time indices across all assets
        all_dfs = {t: features_dict[t].dropna() for t in tickers}
        common_idx = all_dfs[tickers[0]].index
        for t in tickers[1:]:
            common_idx = common_idx.intersection(all_dfs[t].index)
        common_idx = common_idx.sort_values()

        feature_cols = [c for c in all_dfs[tickers[0]].columns
                        if c not in ("target", "ret")]
        n_features = len(feature_cols)
        T = len(common_idx)

        # ── Build X and y arrays (one vectorised lookup per asset) ──
        X_np = np.empty((T, n_assets, n_features), dtype=np.float32)
        y_np = np.zeros((T, n_assets), dtype=np.float32)
        for j, t in enumerate(tickers):
            aligned = all_dfs[t].loc[common_idx]
            X_np[:, j, :] = aligned[feature_cols].to_numpy(dtype=np.float32)
            if "target" in aligned.columns:
                y_np[:, j] = aligned["target"].to_numpy(dtype=np.float32)

        X = torch.from_numpy(X_np)
        # y is a single-step target; reshape to (T, 1, N) for horizon dim
        y = torch.from_numpy(y_np).unsqueeze(1)

        # ── Rolling Cholesky of EWMA covariance ──
        # Start from identity so that any date whose covariance is missing,
        # non-finite or not positive definite keeps a usable factor.
        cov_L = torch.eye(n_assets, dtype=torch.float32).repeat(T, 1, 1)

        # Use the actual returns aligned to this common index for EWMA cov
        aligned_rets = returns_df[tickers].reindex(common_idx).fillna(0.0)
        ewma_cov = aligned_rets.ewm(halflife=126).cov()
        jitter = np.eye(n_assets) * 1e-6

        for i, date in enumerate(common_idx):
            try:
                cov_slice = ewma_cov.loc[date].values.reshape(n_assets, n_assets)
                # The first EWMA window(s) are NaN; do not rely on LAPACK to
                # reject them.
                if not np.isfinite(cov_slice).all():
                    continue
                # Regularise + Cholesky
                L_np = np.linalg.cholesky(cov_slice + jitter).astype(np.float32)
            except (np.linalg.LinAlgError, KeyError, ValueError):
                continue
            cov_L[i] = torch.from_numpy(L_np)

        return X, y, cov_L, tickers, common_idx

    # ------------------------------------------------------------------ #
    #  Training loop                                                       #
    # ------------------------------------------------------------------ #
    def train(
        self,
        features_dict: Dict[str, pd.DataFrame],
        returns_df: pd.DataFrame,
        n_epochs: int = 50,
        batch_size: int = 32,
        val_split: float = 0.2,
        rfr: float = 0.04,
        silent: bool = False,
    ) -> Dict[str, list]:
        """
        Full training loop.  Returns history dict.

        The data is split chronologically (first ``1 - val_split`` fraction
        for training, the rest for validation) and batches are not shuffled.
        Batches whose solve fails or whose loss is NaN/inf are skipped.

        Returns
        -------
        dict with per-epoch lists ``train_loss``, ``val_loss`` (negative
        Sharpe on the validation set) and ``val_sharpe`` (annualised).  All
        three are empty if fewer than 10 aligned samples are available.
        """
        X, y, cov_L, _tickers, _idx = self.build_feature_tensors(
            features_dict, returns_df,
        )

        T = X.shape[0]
        if T < 10:
            logger.warning(f"E2E: Only {T} aligned samples — too few to train. "
                           "Returning empty history.")
            return {"train_loss": [], "val_loss": [], "val_sharpe": []}

        train_end = int(T * (1 - val_split))

        # Strict temporal split — no shuffling
        X_train, X_val = X[:train_end], X[train_end:]
        y_train, y_val = y[:train_end], y[train_end:]
        L_train, L_val = cov_L[:train_end], cov_L[train_end:]

        train_ds = TensorDataset(X_train, y_train, L_train)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)

        history = {"train_loss": [], "val_loss": [], "val_sharpe": []}

        for epoch in range(n_epochs):
            self.forecast_net.train()
            epoch_loss = 0.0
            n_batches = 0

            for X_batch, y_batch, L_batch in train_loader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                L_batch = L_batch.to(self.device)

                self.optimizer.zero_grad()

                # Forward: predict returns + vol scaling
                mu_pred, vol_scale = self.forecast_net(X_batch)

                # Scale Cholesky by predicted vol adjustment (row i of L is
                # multiplied by asset i's factor)
                L_scaled = L_batch * vol_scale.unsqueeze(-1)

                # Differentiable portfolio weights + loss.  Both may invoke
                # the solver, so both are guarded; the predicted weights are
                # solved once and reused by the regret loss.
                try:
                    weights = self.opt_layer(mu_pred, L_scaled)
                    if self.loss_type == "spo":
                        loss = self._spo_loss(
                            mu_pred, y_batch, L_scaled,
                            predicted_weights=weights,
                        )
                    else:  # "sharpe" or fallback
                        loss = self._portfolio_sharpe_loss(weights, y_batch, rfr)
                except Exception as e:
                    logger.warning(f"E2E: solver failed in epoch {epoch}: {e}")
                    continue

                if torch.isnan(loss) or torch.isinf(loss):
                    continue

                loss.backward()

                # Gradient clipping — critical for stability through opt layer
                torch.nn.utils.clip_grad_norm_(
                    self.forecast_net.parameters(), max_norm=1.0,
                )
                self.optimizer.step()

                epoch_loss += loss.item()
                n_batches += 1

            self.scheduler.step()

            # ── Validation ──
            val_loss, val_sharpe = self._validate(X_val, y_val, L_val, rfr, epoch)

            avg_train = epoch_loss / max(n_batches, 1)
            history["train_loss"].append(avg_train)
            history["val_loss"].append(val_loss)
            history["val_sharpe"].append(val_sharpe)

            if not silent and epoch % 10 == 0:
                print(
                    f"  E2E Epoch {epoch:3d} │ "
                    f"Train Loss: {avg_train:+.4f} │ "
                    f"Val Sharpe: {val_sharpe:+.3f}"
                )

        return history

    def _validate(
        self,
        X_val: torch.Tensor,
        y_val: torch.Tensor,
        L_val: torch.Tensor,
        rfr: float,
        epoch: int,
    ) -> Tuple[float, float]:
        """Score the validation split; returns ``(val_loss, val_sharpe)``.

        Both values are NaN when the split is empty or the solve fails.
        """
        nan = float("nan")
        if X_val.shape[0] == 0:
            return nan, nan

        self.forecast_net.eval()
        with torch.no_grad():
            mu_val, vol_val = self.forecast_net(X_val.to(self.device))
            L_val_scaled = L_val.to(self.device) * vol_val.unsqueeze(-1)
            y_dev = y_val.to(self.device)
            try:
                w_val = self.opt_layer(mu_val, L_val_scaled)
                val_loss = self._portfolio_sharpe_loss(w_val, y_dev, rfr).item()
                # squeeze(1) assumes the single-step horizon produced by
                # build_feature_tensors.
                port_rets_val = (y_dev.squeeze(1) * w_val).sum(-1)
                val_sharpe = float(
                    (port_rets_val.mean() / (port_rets_val.std() + 1e-8))
                    * np.sqrt(252)
                )
            except Exception as e:
                logger.warning(f"E2E: validation failed in epoch {epoch}: {e}")
                return nan, nan
        return val_loss, val_sharpe

    # ------------------------------------------------------------------ #
    #  Inference                                                           #
    # ------------------------------------------------------------------ #
    def predict(
        self,
        features_dict: Dict[str, pd.DataFrame],
        base_cov: pd.DataFrame,
    ) -> Tuple[pd.Series, pd.Series]:
        """
        Run inference and return expected returns + suggested weights.
        Integrates with the existing ``OptimizationResult`` interface.

        Parameters
        ----------
        features_dict : ticker → feature frame; the last fully non-NaN row of
            each frame is used as that asset's feature vector.
        base_cov : covariance matrix indexed by ticker on both axes; missing
            entries are treated as 0.

        Returns
        -------
        (mu_series, w_series) : predicted returns and portfolio weights, both
        indexed by ticker.
        """
        self.forecast_net.eval()

        tickers = list(features_dict.keys())
        n = len(tickers)

        # Build latest feature vector
        # NOTE(review): dropna() also drops rows whose "target" is NaN; if the
        # most recent rows have no target yet, older features are used —
        # confirm this is intended.
        latest_features = []
        feature_cols: Optional[list] = None
        for t in tickers:
            df = features_dict[t].dropna()
            if feature_cols is None:
                feature_cols = [c for c in df.columns
                                if c not in ("target", "ret")]
            latest_features.append(
                df[feature_cols].iloc[-1].values.astype(np.float32)
            )

        X = torch.tensor(
            np.array(latest_features), dtype=torch.float32,
        ).unsqueeze(0).to(self.device)                    # (1, N, F)

        # Cholesky of base covariance
        Sigma = base_cov.reindex(index=tickers, columns=tickers).fillna(0).values
        try:
            L_np = np.linalg.cholesky(
                Sigma + np.eye(n) * 1e-6
            ).astype(np.float32)
        except np.linalg.LinAlgError:
            L_np = np.eye(n, dtype=np.float32)
        L = torch.tensor(L_np).unsqueeze(0).to(self.device)   # (1, N, N)

        with torch.no_grad():
            mu_pred, vol_scale = self.forecast_net(X)
            L_scaled = L * vol_scale.unsqueeze(-1)
            weights = self.opt_layer(mu_pred, L_scaled)

        # reshape(-1) rather than squeeze(): keeps a 1-D array even for a
        # single asset.  .cpu() makes this work when the model is on a GPU.
        mu_series = pd.Series(
            mu_pred.detach().reshape(-1).cpu().numpy(), index=tickers,
        )
        w_series = pd.Series(
            weights.detach().reshape(-1).cpu().numpy(), index=tickers,
        )
        return mu_series, w_series


# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 3.
# CACHE HELPERS
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def _cache_key(tickers: list, n_obs: int, rfr: float) -> str:
    """Return a hex digest identifying a (tickers, n_obs, rfr) training setup.

    Tickers are sorted first, so the key does not depend on their order.
    MD5 is used only as a short, stable file-name hash, not for security.
    """
    raw = f"{sorted(tickers)}_{n_obs}_{rfr:.6f}"
    return hashlib.md5(raw.encode()).hexdigest()


def _cache_path(cache_dir: str, tickers: list, n_obs: int, rfr: float) -> str:
    """Return the checkpoint path inside ``cache_dir`` for the given setup."""
    key = _cache_key(tickers, n_obs, rfr)
    return os.path.join(cache_dir, f"e2e_{key}.pkl")


def save_model(trainer: "E2EPortfolioTrainer", cache_dir: str,
               tickers: list, n_obs: int, rfr: float) -> str:
    """Save the trainer's network weights and return the file path.

    ``cache_dir`` is created if needed.  Only ``forecast_net``'s state dict
    is written (via ``torch.save``); optimizer and scheduler state are not.
    """
    os.makedirs(cache_dir, exist_ok=True)
    path = _cache_path(cache_dir, tickers, n_obs, rfr)
    torch.save(trainer.forecast_net.state_dict(), path)
    return path


def load_model(trainer: "E2EPortfolioTrainer", cache_dir: str,
               tickers: list, n_obs: int, rfr: float) -> bool:
    """Load cached weights into ``trainer.forecast_net`` if a match exists.

    Returns ``True`` when a checkpoint for this setup was found and loaded,
    ``False`` when no such file exists.  Errors from an incompatible
    checkpoint (e.g. different architecture) propagate to the caller.
    """
    path = _cache_path(cache_dir, tickers, n_obs, rfr)
    if not os.path.exists(path):
        return False
    # map_location: a checkpoint written from a GPU model must still load
    # into a trainer that lives on another device (e.g. CPU).
    state = torch.load(path, map_location=trainer.device, weights_only=True)
    trainer.forecast_net.load_state_dict(state)
    return True