Add DCDE: Depth-Conditioned Dynamic Ensemble architecture

0930e10 verified 10 days ago

29.7 kB

	"""
	=============================================================================
	DCDE: Depth-Conditioned Dynamic Ensemble with Evidential Uncertainty
	for Femtosecond Laser Internal Hydrogel Etching Prediction

	A novel hybrid architecture combining:
	1. FiLM-conditioned Neural Network (depth-adaptive feature modulation)
	2. XGBoost gradient-boosted trees (capturing tabular feature interactions)
	3. Learned dynamic gating network (input-conditioned fusion)
	4. Evidential Deep Learning (Normal-Inverse-Gamma uncertainty)
	5. Physics-informed regularization (monotonicity + energy constraints)

	References:
	- FiLM: Perez et al., AAAI 2018 (arxiv:1709.07871)
	- Deep Evidential Regression: Amini et al., NeurIPS 2020 (arxiv:1910.02600)
	- DELE gating: AAAI 2023 (arxiv:2302.00932)
	- Physics-informed ML: Zhang et al. 2022 (arxiv:2211.08064)
	=============================================================================
	"""

	from __future__ import annotations

	import math
	from typing import Dict, List, Optional, Tuple

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# =============================================================================
	# 1. PHYSICS-INFORMED FEATURE ENGINEERING (Depth-Dependent)
	# =============================================================================

	class DepthPhysicsFeatures:
	    """
	    Compute analytically-derived physics features that encode how
	    femtosecond laser behavior changes with focusing depth in hydrogels.

	    These features capture three primary depth-dependent effects:
	    1. Spherical aberration (Strehl ratio degradation)
	    2. Group velocity dispersion (pulse temporal broadening)
	    3. Self-focusing proximity (Kerr nonlinearity regime)

	    Scientific basis:
	    - Vogel et al., Applied Physics B (2005) - fs-laser tissue interaction
	    - Schaffer et al., Optics Letters (2001) - bulk modification thresholds
	    - Boyd, Nonlinear Optics (2020) - self-focusing, GVD theory
	    """

	    def __init__(
	        self,
	        n_medium: float = 1.34,  # Refractive index of hydrogel
	        beta2_fs2_mm: float = 55.0,  # GVD parameter (fs²/mm) for water-like medium
	        n2_m2_W: float = 2.0e-20,  # Nonlinear refractive index (m²/W)
	    ):
	        self.n_medium = n_medium
	        # fs²/mm → s²/m (1 fs² = 1e-30 s², 1 mm = 1e-3 m)
	        self.beta2 = beta2_fs2_mm * 1e-30 / 1e-3
	        self.n2 = n2_m2_W

	    def compute(
	        self,
	        focusing_depth_um: np.ndarray,
	        pulse_duration_fs: np.ndarray,
	        wavelength_nm: np.ndarray,
	        NA: np.ndarray,
	        power_mW: np.ndarray,
	        rep_rate_kHz: np.ndarray,
	    ) -> np.ndarray:
	        """
	        Compute physics features from raw parameters.

	        Every argument may be an array or a scalar; scalars are broadcast
	        against the array-valued arguments.

	        Returns array of shape (N, 5), dtype float32, with columns:
	        [strehl_ratio, intensity_factor, z_normalized, self_focus_ratio, depth_aberration]
	        """
	        z = np.asarray(focusing_depth_um) * 1e-6  # µm → m
	        tau0 = np.asarray(pulse_duration_fs) * 1e-15  # fs → s
	        lam = np.asarray(wavelength_nm) * 1e-9  # nm → m
	        na = np.asarray(NA)
	        P_avg = np.asarray(power_mW) * 1e-3  # mW → W
	        f_rep = np.asarray(rep_rate_kHz) * 1e3  # kHz → Hz

	        # 1. Strehl ratio: S(z) = exp(-(2π·Δn·z·NA²/λ)²)
	        #    Quantifies how much aberration degrades the focal spot
	        delta_n = self.n_medium - 1.0  # Air-hydrogel RI mismatch
	        strehl = np.exp(-((2 * np.pi * delta_n * z * na**2) / lam) ** 2)
	        strehl = np.clip(strehl, 1e-6, 1.0)

	        # 2. GVD pulse broadening: τ(z) = τ₀·√(1 + (z/L_D)²)
	        #    Reduced peak intensity at depth
	        L_D = tau0**2 / np.abs(self.beta2)  # Dispersion length
	        tau_z = tau0 * np.sqrt(1 + (z / np.maximum(L_D, 1e-10)) ** 2)
	        intensity_factor = tau0 / np.maximum(tau_z, tau0)  # ∈ (0, 1]

	        # 3. Normalized depth (relative to Rayleigh range)
	        #    Indicates when geometric vs. wave-optical effects dominate
	        w0 = lam / (np.pi * np.maximum(na, 0.01))  # Beam waist
	        z_rayleigh = np.pi * w0**2 / lam
	        z_normalized = z / np.maximum(z_rayleigh, 1e-10)

	        # 4. Self-focusing proximity: P_peak / P_critical
	        #    When > 1: catastrophic self-focusing regime
	        P_peak = P_avg / (f_rep * tau0)  # Peak power per pulse
	        P_cr = 3.77 * lam**2 / (8 * np.pi * self.n_medium * self.n2)
	        sf_ratio = P_peak / np.maximum(P_cr, 1e-10)
	        sf_ratio = np.clip(sf_ratio, 0, 50)  # Cap at 50× critical

	        # 5. Depth-dependent aberration parameter
	        #    Combined effect: how much the focal volume degrades with depth
	        depth_aberration = delta_n * z * na**2 / lam

	        # Broadcast so that scalar arguments can be mixed with per-sample
	        # arrays; same-shape inputs pass through unchanged.
	        columns = np.broadcast_arrays(
	            *(
	                np.atleast_1d(col)
	                for col in (
	                    strehl,
	                    intensity_factor,
	                    z_normalized,
	                    sf_ratio,
	                    depth_aberration,
	                )
	            )
	        )
	        return np.column_stack(columns).astype(np.float32)

	    @property
	    def feature_names(self) -> List[str]:
	        """Column names, in the same order as the columns returned by ``compute``."""
	        return [
	            "strehl_ratio",
	            "intensity_factor_gvd",
	            "z_normalized_rayleigh",
	            "self_focusing_ratio",
	            "depth_aberration_param",
	        ]


	# =============================================================================
	# 2. FiLM-CONDITIONED NEURAL NETWORK (Depth-Adaptive)
	# =============================================================================

	class FiLMGenerator(nn.Module):
	    """
	    Feature-wise Linear Modulation (FiLM) generator.

	    Maps conditioning input (depth features) to per-layer (γ, β) pairs
	    that modulate hidden representations: h' = γ ⊙ h + β

	    Uses the Δγ initialization trick: γ = 1 + Δγ for stable training
	    (identity modulation at initialization).

	    Reference: Perez et al., "FiLM: Visual Reasoning with a General
	    Conditioning Layer", AAAI 2018.
	    """

	    def __init__(self, conditioning_dim: int, hidden_dims: List[int]):
	        super().__init__()
	        self.generators = nn.ModuleList()

	        for h_dim in hidden_dims:
	            generator = nn.Sequential(
	                nn.Linear(conditioning_dim, 64),
	                nn.SiLU(),
	                nn.Linear(64, h_dim * 2),  # γ and β
	            )
	            # Initialize near identity (Δγ = 0, β = 0): with a zeroed final
	            # layer the generator outputs exactly γ = 1, β = 0 at start.
	            nn.init.zeros_(generator[-1].weight)
	            nn.init.zeros_(generator[-1].bias)
	            self.generators.append(generator)

	    def forward(self, conditioning: torch.Tensor) -> List[Tuple[torch.Tensor, torch.Tensor]]:
	        """
	        Parameters
	        ----------
	        conditioning : Tensor, shape (..., conditioning_dim)
	            Depth-related features for conditioning

	        Returns
	        -------
	        list of (gamma, beta) tuples, one per modulated layer; each tensor
	        has shape (..., h_dim) for that layer's hidden width.
	        """
	        film_params = []
	        for generator in self.generators:
	            # First half of the last axis is Δγ, second half is β. Splitting
	            # on the last axis keeps any number of leading batch dimensions.
	            delta_gamma, beta = generator(conditioning).chunk(2, dim=-1)
	            film_params.append((1.0 + delta_gamma, beta))  # Δγ trick
	        return film_params


	class FiLMConditionedMLP(nn.Module):
	    """
	    Multi-layer perceptron with FiLM conditioning at each hidden layer.

	    Architecture:
	    Input → [Linear → BatchNorm → FiLM(γ,β) → SiLU → Dropout] × L → Output

	    The FiLM conditioning allows depth information to modulate the network's
	    intermediate representations multiplicatively, enabling fundamentally
	    different processing depending on focusing depth — not just adding depth
	    as another input feature.
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        hidden_dims: List[int],
	        output_dim: int,
	        conditioning_dim: int,
	        dropout: float = 0.15,
	    ):
	        """
	        Parameters
	        ----------
	        input_dim : width of the input feature vector
	        hidden_dims : widths of the hidden layers (must be non-empty)
	        output_dim : width of the returned latent representation
	        conditioning_dim : width of the FiLM conditioning vector
	        dropout : dropout rate of the first hidden layer; the rate decays
	            linearly with layer index (layer i uses dropout·(1 − i/L))

	        Raises
	        ------
	        ValueError : if ``hidden_dims`` is empty
	        """
	        super().__init__()
	        # Copy so later mutation of the caller's list cannot desynchronize
	        # this attribute from the layers built below.
	        hidden_dims = list(hidden_dims)
	        if not hidden_dims:
	            raise ValueError("hidden_dims must contain at least one layer width")
	        self.hidden_dims = hidden_dims
	        n_hidden = len(hidden_dims)

	        # Build layers
	        dims = [input_dim] + hidden_dims
	        self.layers = nn.ModuleList([
	            nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])
	        ])
	        self.batch_norms = nn.ModuleList([
	            nn.BatchNorm1d(d) for d in hidden_dims
	        ])
	        self.dropouts = nn.ModuleList([
	            nn.Dropout(dropout * (1 - i / n_hidden))
	            for i in range(n_hidden)
	        ])

	        # FiLM generator (depth → modulation parameters)
	        self.film_generator = FiLMGenerator(conditioning_dim, hidden_dims)

	        # Output projection
	        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

	    def forward(
	        self,
	        x: torch.Tensor,
	        conditioning: torch.Tensor,
	    ) -> torch.Tensor:
	        """
	        Parameters
	        ----------
	        x : Tensor (B, input_dim) - laser + material features
	        conditioning : Tensor (B, conditioning_dim) - depth physics features

	        Returns
	        -------
	        Tensor (B, output_dim) - latent representation
	        """
	        # One (γ, β) pair per hidden layer, all derived from the conditioning
	        film_params = self.film_generator(conditioning)

	        h = x
	        for layer, bn, dropout, (gamma, beta) in zip(
	            self.layers, self.batch_norms, self.dropouts, film_params
	        ):
	            h = bn(layer(h))
	            # FiLM modulation is applied after normalization and before the
	            # nonlinearity.
	            h = gamma * h + beta
	            h = dropout(F.silu(h))

	        return self.output_layer(h)


	# =============================================================================
	# 3. EVIDENTIAL REGRESSION HEAD (Normal-Inverse-Gamma)
	# =============================================================================

	class EvidentialHead(nn.Module):
	    """
	    Normal-Inverse-Gamma (NIG) evidential regression head.

	    Outputs four parameters per target that parameterize a NIG distribution,
	    providing both aleatoric and epistemic uncertainty estimates in a single
	    forward pass (no ensemble or MC dropout required).

	    For each output dimension:
	        μ ~ N(γ, σ²/ν)          [predictive mean with epistemic noise]
	        σ² ~ InvGamma(α, β)     [aleatoric variance]

	    Uncertainty decomposition:
	        Aleatoric: E[σ²] = β / (α - 1)
	        Epistemic: Var[μ] = β / (ν(α - 1))

	    Reference: Amini et al., "Deep Evidential Regression", NeurIPS 2020.
	    """

	    def __init__(self, input_dim: int, n_outputs: int):
	        super().__init__()
	        self.n_outputs = n_outputs
	        # Output: 4 parameters per target (γ, ν, α, β)
	        self.fc = nn.Linear(input_dim, n_outputs * 4)

	        # Small-gain init keeps the pre-activation values near zero at start
	        nn.init.xavier_normal_(self.fc.weight, gain=0.1)
	        nn.init.zeros_(self.fc.bias)

	    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
	        """
	        Parameters
	        ----------
	        x : Tensor (B, input_dim) - latent representation

	        Returns
	        -------
	        gamma : Tensor (B, n_outputs) - predictive mean
	        nu : Tensor (B, n_outputs) - evidence for mean (>0)
	        alpha : Tensor (B, n_outputs) - evidence for variance (>1)
	        beta : Tensor (B, n_outputs) - scale for variance (>0)
	        """
	        # Last axis indexes the four NIG parameters of each target
	        raw = self.fc(x).reshape(-1, self.n_outputs, 4)
	        raw_gamma, raw_nu, raw_alpha, raw_beta = raw.unbind(dim=-1)

	        gamma = raw_gamma
	        nu = F.softplus(raw_nu) + 1e-6  # ν > 0
	        alpha = F.softplus(raw_alpha) + 1.0 + 1e-6  # α > 1
	        beta = F.softplus(raw_beta) + 1e-6  # β > 0

	        return gamma, nu, alpha, beta

	    @staticmethod
	    def aleatoric_uncertainty(alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
	        """E[σ²] = β / (α - 1); the denominator is floored at 1e-6."""
	        return beta / (alpha - 1.0).clamp(min=1e-6)

	    @staticmethod
	    def epistemic_uncertainty(nu: torch.Tensor, alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
	        """Var[μ] = β / (ν(α - 1)); the (α - 1) factor is floored at 1e-6."""
	        return beta / (nu * (alpha - 1.0).clamp(min=1e-6))


	# =============================================================================
	# 4. DEPTH-CONDITIONED GATING NETWORK (Learned Dynamic Fusion)
	# =============================================================================

	class DepthConditionedGatingNetwork(nn.Module):
	    """
	    Input-conditioned gating network that dynamically determines how to
	    fuse XGBoost and Neural Network predictions.

	    Unlike a fixed 60/40 weighting, this network learns WHEN each expert
	    is more reliable — conditioned on both input features and focusing depth.

	    Key insight from DELE (arxiv:2302.00932): the gating network benefits
	    from seeing the same features as the experts, plus the experts' own
	    predictions as additional input.

	    Architecture:
	    [input_features ⊕ depth_physics ⊕ expert_predictions] → MLP → softmax(n_experts)
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        depth_dim: int,
	        n_expert_outputs: int,
	        n_experts: int = 2,
	        hidden_dim: int = 64,
	    ):
	        super().__init__()
	        # The gate sees the features, the depth physics and every expert's
	        # prediction vector concatenated along the last axis.
	        total_input = input_dim + depth_dim + n_expert_outputs * n_experts

	        self.gate = nn.Sequential(
	            nn.Linear(total_input, hidden_dim),
	            nn.SiLU(),
	            nn.Dropout(0.1),
	            nn.Linear(hidden_dim, hidden_dim // 2),
	            nn.SiLU(),
	            nn.Linear(hidden_dim // 2, n_experts),
	        )

	        # Temperature parameter (learnable) for softmax sharpness
	        self.temperature = nn.Parameter(torch.ones(1))

	    def forward(
	        self,
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        expert_preds: List[torch.Tensor],
	    ) -> torch.Tensor:
	        """
	        Parameters
	        ----------
	        features : Tensor (B, input_dim)
	        depth_physics : Tensor (B, depth_dim)
	        expert_preds : sequence of Tensor (B, n_outputs), one per expert

	        Returns
	        -------
	        weights : Tensor (B, n_experts) - softmax weights summing to 1
	        """
	        # Unpacking accepts a tuple as well as a list of expert predictions
	        gate_input = torch.cat([features, depth_physics, *expert_preds], dim=-1)
	        # Temperature is floored at 0.1 so the logits cannot be scaled up
	        # without bound (or flipped by a negative temperature).
	        logits = self.gate(gate_input) / self.temperature.clamp(min=0.1)
	        return F.softmax(logits, dim=-1)


	# =============================================================================
	# 5. COMPLETE DCDE MODEL
	# =============================================================================

	class DCDE(nn.Module):
	    """
	    Depth-Conditioned Dynamic Ensemble (DCDE)

	    A hybrid architecture for predicting femtosecond laser internal etching
	    geometry in hydrogels. Combines:

	    1. XGBoost branch: Pre-trained gradient-boosted trees capturing
	       complex tabular feature interactions (frozen during DCDE training)

	    2. FiLM-NN branch: Depth-conditioned neural network where focusing
	       depth modulates intermediate representations via FiLM layers

	    3. Dynamic gating: Input-conditioned fusion network that learns
	       optimal weighting between branches depending on input regime

	    4. Evidential head: NIG distribution output providing calibrated
	       aleatoric + epistemic uncertainty

	    5. Physics-informed loss: Soft monotonicity constraints and energy
	       conservation regularization

	    Training protocol (3-phase, following DELE):
	        Phase 1: Train XGBoost independently on tabular features
	        Phase 2: Train FiLM-NN with evidential head (XGBoost frozen)
	        Phase 3: Train gating network jointly (optionally fine-tune FiLM-NN)
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        depth_physics_dim: int = 5,
	        hidden_dims: Optional[List[int]] = None,
	        n_outputs: int = 5,
	        n_experts: int = 2,
	        gating_hidden: int = 64,
	    ):
	        """
	        Parameters
	        ----------
	        input_dim : width of the input feature vector
	        depth_physics_dim : width of the depth-physics conditioning vector
	        hidden_dims : hidden widths of the FiLM-NN branch; ``None`` selects
	            the default ``[128, 96, 64]``
	        n_outputs : number of regression targets
	        n_experts : number of fused experts; only 2 is supported because
	            ``forward`` fuses exactly the XGBoost and FiLM-NN branches
	        gating_hidden : hidden width of the gating MLP

	        Raises
	        ------
	        ValueError : if ``n_experts`` is not 2
	        """
	        super().__init__()
	        # A fresh list per instance avoids the shared-mutable-default trap.
	        hidden_dims = [128, 96, 64] if hidden_dims is None else list(hidden_dims)
	        if n_experts != 2:
	            raise ValueError(
	                "DCDE fuses exactly two experts (XGBoost and FiLM-NN); "
	                f"got n_experts={n_experts}"
	            )
	        self.input_dim = input_dim
	        self.n_outputs = n_outputs
	        latent_dim = hidden_dims[-1]

	        # FiLM-conditioned NN branch
	        self.film_nn = FiLMConditionedMLP(
	            input_dim=input_dim,
	            hidden_dims=hidden_dims,
	            output_dim=latent_dim,
	            conditioning_dim=depth_physics_dim,
	        )

	        # XGBoost prediction embedding (projects XGB outputs to latent space)
	        self.xgb_embed = nn.Sequential(
	            nn.Linear(n_outputs, latent_dim),
	            nn.SiLU(),
	            nn.Linear(latent_dim, latent_dim),
	        )

	        # Gating network
	        self.gating = DepthConditionedGatingNetwork(
	            input_dim=input_dim,
	            depth_dim=depth_physics_dim,
	            n_expert_outputs=n_outputs,
	            n_experts=n_experts,
	            hidden_dim=gating_hidden,
	        )

	        # Evidential head (NIG parameters)
	        self.evidential_head = EvidentialHead(latent_dim, n_outputs)

	        # Direct output head for XGBoost branch (for gating comparison).
	        # NOTE: not used by forward(); kept so that the module's parameter
	        # set (and therefore saved state dicts) stays unchanged.
	        self.xgb_output = nn.Linear(latent_dim, n_outputs)

	    def forward(
	        self,
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        xgb_predictions: torch.Tensor,
	    ) -> Dict[str, torch.Tensor]:
	        """
	        Parameters
	        ----------
	        features : Tensor (B, input_dim) - all input features
	        depth_physics : Tensor (B, depth_physics_dim) - computed physics features
	        xgb_predictions : Tensor (B, n_outputs) - pre-computed XGBoost predictions

	        Returns
	        -------
	        dict with keys:
	            'gamma' : predictive mean (B, n_outputs)
	            'nu', 'alpha', 'beta' : NIG parameters
	            'aleatoric_unc' : aleatoric uncertainty
	            'epistemic_unc' : epistemic uncertainty
	            'gate_weights' : expert weights (B, 2), column 0 = XGBoost, column 1 = NN
	            'nn_pred' : NN-branch-only predictive mean (evidential head on the NN latent)
	            'xgb_pred' : the ``xgb_predictions`` input, passed through unchanged
	        """
	        # NN branch: depth-conditioned via FiLM
	        nn_latent = self.film_nn(features, depth_physics)

	        # XGBoost branch: embed predictions into latent space
	        xgb_latent = self.xgb_embed(xgb_predictions)

	        # NN-only prediction, used as a gating input (just gamma)
	        nn_pred_raw = self.evidential_head(nn_latent)[0]

	        # Dynamic gating: determine expert weights.
	        # The NN prediction is detached so the gate does not back-propagate
	        # into the NN branch through its own input.
	        gate_weights = self.gating(
	            features,
	            depth_physics,
	            [xgb_predictions, nn_pred_raw.detach()],
	        )

	        # Fused latent representation: per-sample convex combination
	        w_xgb = gate_weights[:, 0:1]  # (B, 1)
	        w_nn = gate_weights[:, 1:2]  # (B, 1)
	        fused_latent = w_xgb * xgb_latent + w_nn * nn_latent

	        # Evidential output (same head as above, now on the fused latent)
	        gamma, nu, alpha, beta = self.evidential_head(fused_latent)

	        # Uncertainty decomposition
	        aleatoric = EvidentialHead.aleatoric_uncertainty(alpha, beta)
	        epistemic = EvidentialHead.epistemic_uncertainty(nu, alpha, beta)

	        return {
	            "gamma": gamma,
	            "nu": nu,
	            "alpha": alpha,
	            "beta": beta,
	            "aleatoric_unc": aleatoric,
	            "epistemic_unc": epistemic,
	            "gate_weights": gate_weights,
	            "nn_pred": nn_pred_raw,
	            "xgb_pred": xgb_predictions,
	        }


	# =============================================================================
	# 6. LOSS FUNCTIONS (NIG + Physics-Informed)
	# =============================================================================

	class DCDELoss(nn.Module):
	    """
	    Composite loss for DCDE training:

	    L_total = L_NIG + λ_mono·L_monotonicity + λ_energy·L_energy + λ_gate·L_gate_entropy

	    Components:
	    1. NIG Loss (evidential regression) - primary data fitting
	    2. Monotonicity loss - enforces physical depth-etch relationships
	    3. Energy conservation - volume scales with deposited energy
	    4. Gate entropy regularization - prevents degenerate gating
	    """

	    def __init__(
	        self,
	        lambda_nig_reg: float = 0.01,
	        lambda_mono: float = 0.05,
	        lambda_energy: float = 0.02,
	        lambda_gate: float = 0.01,
	        depth_feature_idx: int = -1,
	        power_feature_idx: int = 0,
	    ):
	        super().__init__()
	        self.lambda_nig_reg = lambda_nig_reg
	        self.lambda_mono = lambda_mono
	        self.lambda_energy = lambda_energy
	        self.lambda_gate = lambda_gate
	        # NOTE: depth_idx is stored but not read by any loss term in this class.
	        self.depth_idx = depth_feature_idx
	        # Column of `features` holding laser power
	        self.power_idx = power_feature_idx

	    def nig_loss(
	        self,
	        y: torch.Tensor,
	        gamma: torch.Tensor,
	        nu: torch.Tensor,
	        alpha: torch.Tensor,
	        beta: torch.Tensor,
	    ) -> torch.Tensor:
	        """
	        Normal-Inverse-Gamma negative log-likelihood with evidence regularization.

	        L = L_NLL + λ·L_evidence_regularization

	        The regularization penalizes high evidence (ν, α) when the prediction
	        is wrong, encouraging the model to be uncertain when inaccurate.

	        Returns the mean over all elements as a scalar tensor.
	        """
	        # NLL term; clamps keep every log argument strictly positive
	        omega = 2 * beta * (1 + nu)
	        nll = (
	            0.5 * torch.log(torch.pi / nu.clamp(min=1e-6))
	            - alpha * torch.log(omega.clamp(min=1e-10))
	            + (alpha + 0.5) * torch.log(
	                ((y - gamma) ** 2 * nu + omega).clamp(min=1e-10)
	            )
	            + torch.lgamma(alpha) - torch.lgamma(alpha + 0.5)
	        )

	        # Evidence regularization (penalize evidence when wrong)
	        error = torch.abs(y - gamma)
	        evidence = 2 * nu + alpha
	        reg = error * evidence

	        return (nll + self.lambda_nig_reg * reg).mean()

	    def monotonicity_loss(
	        self,
	        features: torch.Tensor,
	        gamma: torch.Tensor,
	        model: nn.Module,
	        depth_physics: torch.Tensor,
	        xgb_pred: torch.Tensor,
	    ) -> torch.Tensor:
	        """
	        Soft monotonicity constraint: increasing laser power at fixed depth
	        should not decrease the first two predicted targets
	        (target 0: etch_depth, target 1: etch_width).

	        Implemented as a one-sided finite-difference penalty: the power
	        column is nudged upward by 5% of its magnitude and only decreases
	        of the prediction are penalized.

	        NOTE(review): the perturbed forward pass runs in whatever mode
	        ``model`` is currently in; in training mode it is subject to
	        dropout noise and updates BatchNorm running statistics.
	        """
	        # Perturb power upward by a small amount. Using |x| keeps the
	        # perturbation upward for negative (e.g. standardized) values, where
	        # multiplying by 1.05 would move the value downward instead.
	        power = features[:, self.power_idx]
	        features_perturbed = features.clone()
	        features_perturbed[:, self.power_idx] = power + 0.05 * power.abs()

	        # Predictions for the perturbed input act as a fixed reference, so
	        # gradients flow only through `gamma`.
	        with torch.no_grad():
	            output_perturbed = model(features_perturbed, depth_physics, xgb_pred)
	        gamma_perturbed = output_perturbed["gamma"]

	        # Only penalize violations (relu of the decrease)
	        violation_depth = F.relu(gamma[:, 0] - gamma_perturbed[:, 0])
	        violation_width = F.relu(gamma[:, 1] - gamma_perturbed[:, 1])

	        return (violation_depth.mean() + violation_width.mean()) / 2

	    def energy_conservation_loss(
	        self,
	        features: torch.Tensor,
	        gamma: torch.Tensor,
	    ) -> torch.Tensor:
	        """
	        Soft energy constraint: predicted ablated volume should correlate
	        positively with deposited energy across the batch.

	        Volume proxy ∝ depth × width²
	        Energy proxy ∝ power (simplified; could include scan speed, passes)

	        Penalizes anti-correlation: returns relu(−ρ), where ρ is the Pearson
	        correlation (cosine similarity of the mean-centred proxies). The
	        result is 0 when either proxy is constant over the batch.
	        """
	        # Volume proxy from predictions
	        depth_pred = gamma[:, 0].clamp(min=0)
	        width_pred = gamma[:, 1].clamp(min=0)
	        volume_proxy = depth_pred * width_pred ** 2

	        # Energy proxy from inputs
	        energy_proxy = features[:, self.power_idx].clamp(min=1e-6)

	        # Both proxies are non-negative, so their raw cosine similarity can
	        # never be negative and the penalty would be identically zero.
	        # Centring turns the cosine into a correlation that can go negative.
	        volume_centered = volume_proxy - volume_proxy.mean()
	        energy_centered = energy_proxy - energy_proxy.mean()
	        correlation = F.cosine_similarity(volume_centered, energy_centered, dim=0)
	        return F.relu(-correlation)

	    def gate_entropy_loss(self, gate_weights: torch.Tensor) -> torch.Tensor:
	        """
	        Encourage non-degenerate gating (not always choosing one expert).

	        Returns log(n_experts) minus the batch-mean entropy of the gate
	        weights: approximately 0 for uniform weights and largest when the
	        gate is one-hot.
	        """
	        # Per-sample entropy (epsilon guards log(0))
	        entropy = -(gate_weights * torch.log(gate_weights + 1e-8)).sum(dim=-1)
	        # Maximize entropy → minimize its gap to the maximum
	        max_entropy = math.log(gate_weights.shape[-1])
	        return max_entropy - entropy.mean()

	    def forward(
	        self,
	        y: torch.Tensor,
	        model_output: Dict[str, torch.Tensor],
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        model: Optional[nn.Module] = None,
	    ) -> Dict[str, torch.Tensor]:
	        """
	        Compute total loss with all components.

	        Parameters
	        ----------
	        y : regression targets, same shape as ``model_output['gamma']``
	        model_output : dict providing 'gamma', 'nu', 'alpha', 'beta',
	            'gate_weights' and 'xgb_pred'
	        features : input features (power is read from column ``power_idx``)
	        depth_physics : depth physics features, forwarded to ``model``
	        model : model used for the perturbed forward pass of the
	            monotonicity term; when ``None`` that term is 0

	        Returns dict with individual loss components for logging
	        ('total', 'nig', 'monotonicity', 'energy', 'gate_entropy').
	        """
	        gamma = model_output["gamma"]
	        nu = model_output["nu"]
	        alpha = model_output["alpha"]
	        beta = model_output["beta"]
	        gate_weights = model_output["gate_weights"]
	        xgb_pred = model_output["xgb_pred"]

	        # Primary loss: NIG
	        l_nig = self.nig_loss(y, gamma, nu, alpha, beta)

	        # Physics losses (each skipped when its weight is zero)
	        l_mono = torch.tensor(0.0, device=y.device)
	        if model is not None and self.lambda_mono > 0:
	            l_mono = self.monotonicity_loss(features, gamma, model, depth_physics, xgb_pred)

	        l_energy = torch.tensor(0.0, device=y.device)
	        if self.lambda_energy > 0:
	            l_energy = self.energy_conservation_loss(features, gamma)

	        # Gating regularization
	        l_gate = self.gate_entropy_loss(gate_weights)

	        # Total
	        total = (
	            l_nig
	            + self.lambda_mono * l_mono
	            + self.lambda_energy * l_energy
	            + self.lambda_gate * l_gate
	        )

	        return {
	            "total": total,
	            "nig": l_nig,
	            "monotonicity": l_mono,
	            "energy": l_energy,
	            "gate_entropy": l_gate,
	        }


	# =============================================================================
	# 7. TRAINING UTILITIES
	# =============================================================================

	class DCDETrainer:
	    """
	    Three-phase training protocol for DCDE.

	    Phase 1: Train XGBoost on tabular features (external, uses sklearn/xgboost)
	    Phase 2: Train FiLM-NN with evidential head (XGBoost predictions as input)
	    Phase 3: Train gating network + fine-tune FiLM-NN end-to-end

	    This class only performs single optimization steps and inference; the
	    optimizer (and hence which parameters are trained and at what learning
	    rate) is supplied by the caller. ``lr_phase2``, ``lr_phase3`` and
	    ``weight_decay`` are stored for the caller's use when building those
	    optimizers and are not read by any method here.
	    """

	    def __init__(
	        self,
	        model: "DCDE",
	        loss_fn: "DCDELoss",
	        lr_phase2: float = 1e-3,
	        lr_phase3: float = 3e-4,
	        weight_decay: float = 1e-4,
	        device: str = "cpu",
	    ):
	        self.model = model.to(device)
	        self.loss_fn = loss_fn
	        self.lr_phase2 = lr_phase2
	        self.lr_phase3 = lr_phase3
	        self.weight_decay = weight_decay
	        self.device = device

	    def _to_device(self, *tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
	        """Move tensors to the trainer's device (no-op when already there)."""
	        return tuple(t.to(self.device) for t in tensors)

	    def phase2_train_step(
	        self,
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        xgb_predictions: torch.Tensor,
	        targets: torch.Tensor,
	        optimizer: torch.optim.Optimizer,
	    ) -> Dict[str, float]:
	        """
	        Single training step for Phase 2 (FiLM-NN + evidential head).

	        Runs forward, loss, backward, gradient-norm clipping (max norm 1.0)
	        and one optimizer step. Returns every loss component as a float.
	        """
	        # The model lives on self.device; inputs must be moved there too.
	        features, depth_physics, xgb_predictions, targets = self._to_device(
	            features, depth_physics, xgb_predictions, targets
	        )

	        self.model.train()
	        optimizer.zero_grad()

	        output = self.model(features, depth_physics, xgb_predictions)
	        losses = self.loss_fn(targets, output, features, depth_physics, self.model)

	        losses["total"].backward()
	        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
	        optimizer.step()

	        return {name: value.item() for name, value in losses.items()}

	    def phase3_train_step(
	        self,
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        xgb_predictions: torch.Tensor,
	        targets: torch.Tensor,
	        optimizer: torch.optim.Optimizer,
	    ) -> Dict[str, float]:
	        """
	        Single training step for Phase 3 (end-to-end with gating).

	        The step itself is identical to Phase 2; the phases differ only in
	        the optimizer the caller passes in (learning rate / parameter set).
	        """
	        return self.phase2_train_step(features, depth_physics, xgb_predictions, targets, optimizer)

	    @torch.no_grad()
	    def predict(
	        self,
	        features: torch.Tensor,
	        depth_physics: torch.Tensor,
	        xgb_predictions: torch.Tensor,
	    ) -> Dict[str, np.ndarray]:
	        """
	        Inference with uncertainty quantification.

	        Switches the model to eval mode (and leaves it there).

	        Returns
	        -------
	        dict with:
	            'mean': predicted values (B, n_outputs)
	            'aleatoric_unc': aleatoric uncertainty per target
	            'epistemic_unc': epistemic uncertainty per target
	            'total_unc': sum of aleatoric and epistemic uncertainty
	            'gate_weights': expert weights showing XGB vs NN dominance
	        """
	        features, depth_physics, xgb_predictions = self._to_device(
	            features, depth_physics, xgb_predictions
	        )

	        self.model.eval()
	        output = self.model(features, depth_physics, xgb_predictions)
	        aleatoric = output["aleatoric_unc"]
	        epistemic = output["epistemic_unc"]

	        return {
	            "mean": output["gamma"].cpu().numpy(),
	            "aleatoric_unc": aleatoric.cpu().numpy(),
	            "epistemic_unc": epistemic.cpu().numpy(),
	            "total_unc": (aleatoric + epistemic).cpu().numpy(),
	            "gate_weights": output["gate_weights"].cpu().numpy(),
	        }