# src/models.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers import UNet2DModel
from transformers import ViTForImageClassification, ViTConfig
import math
from typing import Optional, List
import numpy as np
# =============================================================================
# TIME EMBEDDING (shared utility)
# =============================================================================
class TimeEmbedding(nn.Module):
def __init__(self, dim: int) -> None:
super().__init__()
self.dim = dim
def forward(self, t: torch.Tensor) -> torch.Tensor:
device = t.device
half_dim = self.dim // 2
embeddings = math.log(10000) / (half_dim - 1)
embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
embeddings = t[:, None] * embeddings[None, :]
embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
return embeddings
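# Minimal usage sketch (illustrative, not part of the original module): the
# embedding maps a batch of scalar timesteps to `dim` sinusoidal features
# (sin and cos halves concatenated).
def _demo_time_embedding() -> torch.Tensor:
    emb = TimeEmbedding(dim=64)
    t = torch.rand(8)            # 8 timesteps in [0, 1]
    out = emb(t)                 # -> [8, 64]
    assert out.shape == (8, 64)
    return out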
class DiTTimestepEmbedder(nn.Module):
def __init__(self, hidden_size, freq_dim=128, max_period=10000):
super().__init__()
self.freq_dim = freq_dim
self.max_period = max_period
self.mlp = nn.Sequential(
nn.Linear(2*freq_dim, hidden_size, bias=True),
nn.SiLU(),
nn.Linear(hidden_size, hidden_size, bias=True),
)
def forward(self, t): # t: [B] integers (float tensor ok)
# standard "timestep_embedding" (like ADM/DiT)
half = self.freq_dim
device = t.device
# geometric frequency spectrum (standard ADM/DiT timestep embedding)
freqs = torch.exp(
-torch.arange(half, device=device).float() * np.log(self.max_period) / half
)
args = t.float()[:, None] * freqs[None] # [B, half]
emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) # [B, 2*half]
return self.mlp(emb)
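# Illustrative sketch: unlike TimeEmbedding above, this embedder builds
# 2*freq_dim sinusoidal features and projects them to hidden_size with an
# MLP. The sizes below are arbitrary and only demonstrate the shapes.
def _demo_dit_timestep_embedder() -> torch.Tensor:
    embedder = DiTTimestepEmbedder(hidden_size=256, freq_dim=128)
    t = torch.randint(0, 1000, (4,))   # integer timesteps; float tensors also work
    out = embedder(t)                  # -> [4, 256]
    assert out.shape == (4, 256)
    return out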
# =============================================================================
# OUTPUT CONVERTER (for heterogeneous objectives)
# =============================================================================
class OutputConverter(nn.Module):
def __init__(self, schedule_type: str = 'linear_interp', use_latents: bool = False, derivative_eps: float = 1e-4):
super().__init__()
from schedules import NoiseSchedule
self.schedule = NoiseSchedule(schedule_type)
self.schedule_type = schedule_type
self.use_latents = use_latents
self.derivative_eps = derivative_eps # For finite difference derivatives
# Set clamping range based on data type
# VAE latents have larger range than pixel-space images
self.clamp_range = 20.0 if use_latents else 5.0
def _get_schedule_with_derivatives(self, t: torch.Tensor):
"""
Compute schedule coefficients and their derivatives.
Essential for correct velocity computation with any schedule.
"""
# Get coefficients at current time
alpha_t, sigma_t = self.schedule.get_schedule(t)
# Compute derivatives using finite differences
h = torch.full_like(t, self.derivative_eps)
t_plus = (t + h).clamp(0.0, 1.0)
t_minus = (t - h).clamp(0.0, 1.0)
alpha_plus, sigma_plus = self.schedule.get_schedule(t_plus)
alpha_minus, sigma_minus = self.schedule.get_schedule(t_minus)
# Derivatives
dt = (t_plus - t_minus).clamp(min=1e-6)
d_alpha_dt = (alpha_plus - alpha_minus) / dt
d_sigma_dt = (sigma_plus - sigma_minus) / dt
return alpha_t, sigma_t, d_alpha_dt, d_sigma_dt
def epsilon_to_velocity(self, epsilon_pred: torch.Tensor, x_t: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
"""
Correct ε→v conversion for ANY schedule using proper derivatives.
From ODE: dx_t/dt = d(alpha_t)/dt * x_0 + d(sigma_t)/dt * ε
This is the TRUE velocity for the schedule!
"""
# Get schedule coefficients AND their derivatives
alpha_t, sigma_t, d_alpha_dt, d_sigma_dt = self._get_schedule_with_derivatives(t)
# Reshape for broadcasting
alpha_t = alpha_t.view(-1, 1, 1, 1)
sigma_t = sigma_t.view(-1, 1, 1, 1)
d_alpha_dt = d_alpha_dt.view(-1, 1, 1, 1)
d_sigma_dt = d_sigma_dt.view(-1, 1, 1, 1)
# Numerical stability: handle small alpha_t
alpha_safe = torch.clamp(alpha_t, min=0.01)
# Step 1: Recover x_0 using Tweedie's formula
x_0_pred = (x_t - sigma_t * epsilon_pred) / alpha_safe
# Step 2: Clamp x_0 to reasonable range (prevents blow-up)
# Use adaptive clamping: larger range for VAE latents, tighter for pixel space
x_0_pred = torch.clamp(x_0_pred, -self.clamp_range, self.clamp_range)
# Step 3: Compute velocity based on schedule type
if self.schedule_type == 'linear_interp':
# For linear interpolation: x_t = (1-t)*x_0 + t*ε
# Velocity is simply: v = ε - x_0
v = epsilon_pred - x_0_pred
else:
# For cosine and other schedules: use proper derivatives
# v = d(alpha_t)/dt * x_0 + d(sigma_t)/dt * ε
v = d_alpha_dt * x_0_pred + d_sigma_dt * epsilon_pred
# Adaptive velocity scaling for cosine schedule
# Derivatives vary dramatically with timestep - need adaptive dampening
if self.schedule_type == 'cosine':
t_val = t[0].item() if t.numel() > 0 else 0.5
if t_val > 0.85:
# Very high noise: derivatives are large, need dampening
scale = 0.88
elif t_val > 0.6:
# Medium-high noise: moderate dampening
scale = 0.93
else:
# Low to medium noise: slight dampening
scale = 0.96
v = v * scale
# Per-channel bias correction to prevent color drift
# The model has inherent channel bias that gets amplified by integration
# Remove per-channel mean to prevent accumulation
# Only apply to color channels (1,2,3), preserve luminance channel (0)
for c in range(1, 4):
v[:, c] = v[:, c] - v[:, c].mean()
return v
def convert(self, prediction: torch.Tensor, objective_type: str, x_t: torch.Tensor, t: torch.Tensor):
"""
Convert any prediction to velocity space.
Args:
prediction: expert output
objective_type: 'ddpm' | 'fm' | 'rf'
x_t: current noisy state
t: current timesteps
Returns:
v: velocity representation
"""
if objective_type == "ddpm":
# Proper ε→v conversion for unified integration
return self.epsilon_to_velocity(prediction, x_t, t)
elif objective_type in ["fm", "rf"]:
return prediction # Already velocity
else:
raise ValueError(f"Unknown objective type: {objective_type}")
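# Illustrative sketch of the converter's role at inference: a DDPM expert's
# epsilon prediction is mapped into velocity space so all experts can share
# one ODE integrator. Assumes the repo's `schedules` module is importable;
# shapes and values are arbitrary.
def _demo_output_converter() -> torch.Tensor:
    converter = OutputConverter(schedule_type='linear_interp', use_latents=True)
    x_t = torch.randn(2, 4, 32, 32)        # noisy VAE latents
    eps_pred = torch.randn(2, 4, 32, 32)   # epsilon prediction from a DDPM expert
    t = torch.full((2,), 0.7)              # current timestep per sample
    v = converter.convert(eps_pred, "ddpm", x_t, t)   # converted to velocity
    v_fm = converter.convert(eps_pred, "fm", x_t, t)  # FM/RF outputs pass through
    assert v.shape == x_t.shape and torch.equal(v_fm, eps_pred)
    return v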
# =============================================================================
# EXPERT MODELS
# =============================================================================
class UNetExpert(nn.Module):
"""UNet expert using diffusers"""
def __init__(self, config) -> None:
super().__init__()
# Default UNet params
default_params = {
"sample_size": config.image_size,
"in_channels": config.num_channels,
"out_channels": config.num_channels,
"layers_per_block": 2,
"block_out_channels": [64, 128, 256, 256],
"attention_head_dim": 8,
}
# Override with config params
params = {**default_params, **config.expert_params}
# Store objective type for heterogeneous training (and remove from params)
self.objective_type = params.pop("objective_type", "fm")
# Store and initialize schedule (NEW)
schedule_type = params.pop("schedule_type", "linear_interp")
from schedules import NoiseSchedule
self.schedule = NoiseSchedule(schedule_type)
self.unet = UNet2DModel(**params)
def forward(self, xt: torch.Tensor, t: torch.Tensor, **kwargs) -> torch.Tensor:
# Map normalized timesteps in [0, 1] to integer steps in [0, 999] for diffusers
t_scaled = (t * 999).round().long().clamp(0, 999)
return self.unet(xt, t_scaled).sample
def compute_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Unified loss computation based on objective type"""
if self.objective_type == "ddpm":
return self.ddpm_loss(x0)
elif self.objective_type == "fm":
return self.flow_matching_loss(x0)
elif self.objective_type == "rf":
return self.rectified_flow_loss(x0)
else:
raise ValueError(f"Unknown objective type: {self.objective_type}")
def ddpm_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""DDPM: predict noise ε"""
batch_size = x0.shape[0]
device = x0.device
t = torch.rand(batch_size, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
pred_eps = self.forward(xt, t)
return F.mse_loss(pred_eps, noise)
def rectified_flow_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Rectified Flow: predict velocity v = x_1 - x_0"""
batch_size = x0.shape[0]
device = x0.device
t = torch.rand(batch_size, device=device)
x1 = torch.randn_like(x0)
xt = (1 - t).view(-1, 1, 1, 1) * x0 + t.view(-1, 1, 1, 1) * x1
pred_v = self.forward(xt, t)
true_v = x1 - x0
return F.mse_loss(pred_v, true_v)
def flow_matching_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Flow matching loss for training"""
batch_size = x0.shape[0]
device = x0.device
# Sample random timesteps
t = torch.rand(batch_size, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
# Add noise
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
# Predict velocity
pred_v = self.forward(xt, t)
# True velocity for the linear path x_t = (1-t)*x0 + t*noise is dx_t/dt = noise - x0
true_v = noise - x0
return F.mse_loss(pred_v, true_v)
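# Minimal training-step sketch for a single expert (illustrative): the real
# config class lives elsewhere in the repo, so a small stand-in namespace
# with just the fields read above is assumed, along with an importable
# `schedules` module.
def _demo_unet_expert_step() -> torch.Tensor:
    from types import SimpleNamespace
    cfg = SimpleNamespace(
        image_size=32,
        num_channels=3,
        expert_params={"objective_type": "fm", "schedule_type": "linear_interp"},
    )
    expert = UNetExpert(cfg)
    x0 = torch.randn(2, 3, 32, 32)    # tiny batch of clean images
    loss = expert.compute_loss(x0)    # flow-matching MSE on the velocity target
    loss.backward()                   # gradients flow into the diffusers UNet
    return loss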
class SimpleCNNExpert(nn.Module):
"""Simple CNN expert for fast training"""
def __init__(self, config) -> None:
super().__init__()
# Default params
default_params = {
"hidden_dims": [64, 128, 256],
"time_dim": 64,
}
params = {**default_params, **config.expert_params}
# Store objective type for heterogeneous training
self.objective_type = params.get("objective_type", "fm")
# Store and initialize schedule (NEW)
schedule_type = params.get("schedule_type", "linear_interp")
from schedules import NoiseSchedule
self.schedule = NoiseSchedule(schedule_type)
self.time_embedding = TimeEmbedding(params["time_dim"])
self.target_size = config.image_size
# Simple encoder-decoder
self.encoder = self._build_encoder(config.num_channels, params["hidden_dims"])
self.decoder = self._build_decoder(params["hidden_dims"], config.num_channels)
# Time conditioning
self.time_mlp = nn.Sequential(
nn.Linear(params["time_dim"], params["hidden_dims"][-1]),
nn.SiLU(),
nn.Linear(params["hidden_dims"][-1], params["hidden_dims"][-1])
)
def _build_encoder(self, in_channels: int, hidden_dims: List[int]) -> nn.Sequential:
layers = []
prev_dim = in_channels
for dim in hidden_dims:
layers.extend([
nn.Conv2d(prev_dim, dim, 3, padding=1),
nn.GroupNorm(8, dim),
nn.SiLU(),
nn.Conv2d(dim, dim, 3, padding=1),
nn.GroupNorm(8, dim),
nn.SiLU(),
nn.MaxPool2d(2)
])
prev_dim = dim
return nn.Sequential(*layers)
def _build_decoder(self, hidden_dims: List[int], out_channels: int) -> nn.Sequential:
layers = []
reversed_dims = list(reversed(hidden_dims))
for i, dim in enumerate(reversed_dims[:-1]):
next_dim = reversed_dims[i + 1]
layers.extend([
nn.ConvTranspose2d(dim, next_dim, 4, stride=2, padding=1),
nn.GroupNorm(8, next_dim),
nn.SiLU(),
nn.Conv2d(next_dim, next_dim, 3, padding=1),
nn.GroupNorm(8, next_dim),
nn.SiLU(),
])
# Final layer
layers.append(nn.Conv2d(reversed_dims[-1], out_channels, 3, padding=1))
return nn.Sequential(*layers)
def forward(self, xt: torch.Tensor, t: torch.Tensor, **kwargs) -> torch.Tensor:
# Time embedding
time_emb = self.time_embedding(t)
time_features = self.time_mlp(time_emb)
# Encode
encoded = self.encoder(xt)
# Add time conditioning
time_features = time_features.view(time_features.shape[0], -1, 1, 1)
time_features = time_features.expand(-1, -1, encoded.shape[2], encoded.shape[3])
conditioned = encoded + time_features
# Decode
output = self.decoder(conditioned)
# Ensure output matches target size
output = F.interpolate(output, size=xt.shape[-2:], mode='bilinear', align_corners=False)
return output
def compute_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Unified loss computation based on objective type"""
if self.objective_type == "ddpm":
return self.ddpm_loss(x0)
elif self.objective_type == "fm":
return self.flow_matching_loss(x0)
elif self.objective_type == "rf":
return self.rectified_flow_loss(x0)
else:
raise ValueError(f"Unknown objective type: {self.objective_type}")
def ddpm_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""DDPM: predict noise ε"""
batch_size = x0.shape[0]
device = x0.device
t = torch.rand(batch_size, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
pred_eps = self.forward(xt, t)
# Ensure pred_eps matches noise shape
if pred_eps.shape != noise.shape:
pred_eps = F.interpolate(pred_eps, size=noise.shape[-2:], mode='bilinear', align_corners=False)
return F.mse_loss(pred_eps, noise)
def rectified_flow_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Rectified Flow: predict velocity v = x_1 - x_0"""
batch_size = x0.shape[0]
device = x0.device
t = torch.rand(batch_size, device=device)
x1 = torch.randn_like(x0)
xt = (1 - t).view(-1, 1, 1, 1) * x0 + t.view(-1, 1, 1, 1) * x1
pred_v = self.forward(xt, t)
true_v = x1 - x0
# Ensure pred_v matches true_v shape
if pred_v.shape != true_v.shape:
pred_v = F.interpolate(pred_v, size=true_v.shape[-2:], mode='bilinear', align_corners=False)
return F.mse_loss(pred_v, true_v)
def flow_matching_loss(self, x0: torch.Tensor) -> torch.Tensor:
"""Flow matching loss"""
batch_size = x0.shape[0]
device = x0.device
t = torch.rand(batch_size, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
pred_v = self.forward(xt, t)
# For the linear path, the target velocity is noise - x0
true_v = noise - x0
# Ensure pred_v matches true_v shape
if pred_v.shape != true_v.shape:
pred_v = F.interpolate(pred_v, size=true_v.shape[-2:], mode='bilinear', align_corners=False)
return F.mse_loss(pred_v, true_v)
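# Shape sketch for the CNN expert (illustrative, using a stand-in config
# namespace and assuming the `schedules` module is importable): the encoder
# halves the resolution per stage and the final interpolation restores the
# input size, so predictions always match the target tensor.
def _demo_simple_cnn_expert() -> torch.Tensor:
    from types import SimpleNamespace
    cfg = SimpleNamespace(image_size=32, num_channels=3,
                          expert_params={"objective_type": "rf"})
    expert = SimpleCNNExpert(cfg)
    x0 = torch.randn(2, 3, 32, 32)
    out = expert(x0, torch.rand(2))   # -> [2, 3, 32, 32] after interpolation
    assert out.shape == x0.shape
    return out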
# Helper function from original DiT
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
# Fixed 2D sin-cos positional embedding (from the original DiT implementation)
def get_2d_sincos_pos_embed(embed_dim, grid_size):
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h)
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
assert embed_dim % 2 == 0
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
emb = np.concatenate([emb_h, emb_w], axis=1)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.
omega = 1. / 10000**omega
pos = pos.reshape(-1)
out = np.einsum('m,d->md', pos, omega)
emb_sin = np.sin(out)
emb_cos = np.cos(out)
emb = np.concatenate([emb_sin, emb_cos], axis=1)
return emb
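# Quick shape sketch (illustrative values): for a 16x16 patch grid and a
# 768-dim model, the helper returns one fixed sin-cos vector per patch.
def _demo_pos_embed() -> np.ndarray:
    pe = get_2d_sincos_pos_embed(embed_dim=768, grid_size=16)
    assert pe.shape == (16 * 16, 768)   # [num_patches, embed_dim]
    return pe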
# Timestep Embedder
class TimestepEmbedder(nn.Module):
def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
super().__init__()
self.frequency_embedding_size = frequency_embedding_size
self.mlp = nn.Sequential(
nn.Linear(frequency_embedding_size, hidden_size, bias=True),
nn.SiLU(),
nn.Linear(hidden_size, hidden_size, bias=True),
)
@staticmethod
def timestep_embedding(t, dim, max_period=10000):
half = dim // 2
freqs = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32, device=t.device) / half)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
def forward(self, t: torch.Tensor) -> torch.Tensor:
t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
return self.mlp(t_freq)
# DiTBlock with proper AdaLN-Zero
class DiTBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float = 4.0, use_text: bool = False, use_adaln_single: bool = False):
super().__init__()
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.attn = nn.MultiheadAttention(hidden_size, num_heads, dropout=0.1, batch_first=True)
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.mlp = nn.Sequential(
nn.Linear(hidden_size, mlp_hidden_dim),
nn.GELU(approximate="tanh"), # Match original
nn.Linear(mlp_hidden_dim, hidden_size),
)
# AdaLN modulation - either per-block MLP or AdaLN-Single embeddings
self.use_adaln_single = use_adaln_single
if use_adaln_single:
# AdaLN-Single: use learnable per-block embeddings instead of MLP
self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)
self.adaLN_modulation = None # No MLP needed
else:
# Original AdaLN with per-block MLP
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
nn.Linear(hidden_size, 6 * hidden_size, bias=True)
)
self.scale_shift_table = None
# Optional text cross-attention
self.use_text = use_text
if use_text:
# Note: PixArt uses xformers which may handle unnormalized queries differently
# We add a simple norm for stability with PyTorch's MultiheadAttention
self.norm_cross = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.cross_attn = nn.MultiheadAttention(hidden_size, num_heads, dropout=0.1, batch_first=True)
def forward(self, x: torch.Tensor, c: torch.Tensor, text_emb: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None):
# Get modulation parameters
if self.use_adaln_single:
# AdaLN-Single: combine global time embedding with per-block parameters
# c should be pre-computed from global t_block with shape [B, 6*hidden_size]
B = x.shape[0]
# Chunk and squeeze to get [B, hidden_size] tensors, matching what modulate() and the gates expect
temp = (self.scale_shift_table[None] + c.reshape(B, 6, -1)).chunk(6, dim=1)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [t.squeeze(1) for t in temp]
else:
# Original AdaLN: compute modulation from per-block MLP
# Chunks are already [B, hidden_size]; the squeeze below is a no-op kept for symmetry with the branch above
temp = self.adaLN_modulation(c).chunk(6, dim=1)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [t.squeeze(1) for t in temp]
# Self-attention with modulation
# Both paths now use modulate function for consistency
x_norm = modulate(self.norm1(x), shift_msa, scale_msa)
attn_out, _ = self.attn(x_norm, x_norm, x_norm)
x = x + gate_msa.unsqueeze(1) * attn_out
# Optional cross-attention
if self.use_text and text_emb is not None:
if text_emb.dim() == 2:
text_emb = text_emb.unsqueeze(1)
# Convert attention mask to key_padding_mask format (True = ignore)
# attention_mask: shape [B, T]; either bool (True=keep) or 0/1 numeric (1=keep)
key_padding_mask = None
if attention_mask is not None:
if attention_mask.dtype is not torch.bool:
# Convert 0/1 (or >=1) to bool keep-mask first
keep_mask = attention_mask > 0
else:
keep_mask = attention_mask
# key_padding_mask semantics: True = ignore, False = keep
key_padding_mask = ~keep_mask # logical NOT, not arithmetic subtraction
# Normalize queries for stability (PixArt uses xformers which may differ)
x_norm = self.norm_cross(x)
cross_out, _ = self.cross_attn(x_norm, text_emb, text_emb, key_padding_mask=key_padding_mask)
x = x + cross_out
# MLP with modulation
# Both paths now use modulate function for consistency
x_norm = modulate(self.norm2(x), shift_mlp, scale_mlp)
mlp_out = self.mlp(x_norm)
x = x + gate_mlp.unsqueeze(1) * mlp_out
return x
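# Conditioning-shape sketch for the two AdaLN modes (illustrative sizes):
# the per-block-MLP path takes c of shape [B, hidden_size], while the
# AdaLN-Single path expects the precomputed global modulation [B, 6*hidden_size].
def _demo_dit_block_modes() -> torch.Tensor:
    B, N, H = 2, 16, 256
    x = torch.randn(B, N, H)
    block = DiTBlock(hidden_size=H, num_heads=4)            # per-block AdaLN MLP
    y = block(x, c=torch.randn(B, H))
    single = DiTBlock(hidden_size=H, num_heads=4, use_adaln_single=True)
    y2 = single(x, c=torch.randn(B, 6 * H))                 # e.g. a global t_block output
    assert y.shape == y2.shape == (B, N, H)
    return y2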
# FinalLayer with AdaLN modulation
class FinalLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
super().__init__()
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
nn.Linear(hidden_size, 2 * hidden_size, bias=True)
)
def forward(self, x: torch.Tensor, c: torch.Tensor):
shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
x = modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
return x
# T2IFinalLayer with AdaLN-Single for parameter efficiency
class T2IFinalLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
super().__init__()
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
# AdaLN-Single: use learnable embeddings instead of MLP
self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
self.hidden_size = hidden_size
def forward(self, x: torch.Tensor, t: torch.Tensor):
# t should be the original time embedding with shape [B, hidden_size]
# Following PixArt implementation exactly
shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
# shift and scale are [B, 1, hidden_size], use t2i_modulate style
x = self.norm_final(x) * (1 + scale) + shift
x = self.linear(x)
return x
# DiTExpert
class DiTExpert(nn.Module):
def __init__(self, config):
super().__init__()
default_params = {
"hidden_size": 768,
"num_layers": 12,
"num_heads": 12,
"patch_size": 2,
"in_channels": 4,
"out_channels": 4,
"use_text_conditioning": False,
"use_class_conditioning": False,
"num_classes": 1000, # ImageNet classes
"mlp_ratio": 4.0,
"text_embed_dim": 768,
"use_dit_time_embed": False,
}
params = {**default_params, **config.expert_params}
self.patch_size = params["patch_size"]
self.in_channels = params["in_channels"]
self.out_channels = params["out_channels"]
self.hidden_size = params["hidden_size"]
self.num_heads = params["num_heads"]
self.use_text = params.get("use_text_conditioning", False)
self.use_class = params.get("use_class_conditioning", False)
self.cfg_dropout_prob = params.get("cfg_dropout_prob", 0.1) # 10% dropout for CFG
self.text_embed_dim = params.get("text_embed_dim", 768)
self.use_adaln_single = params.get("use_adaln_single", False) # AdaLN-Single for parameter efficiency
self.depth = params["num_layers"]
# Store objective type for heterogeneous training
self.objective_type = params.get("objective_type", "fm")
# Store and initialize schedule (NEW)
schedule_type = params.get("schedule_type", "linear_interp")
from schedules import NoiseSchedule
self.schedule = NoiseSchedule(schedule_type)
# Validation: cannot use both text and class conditioning simultaneously
assert not (self.use_text and self.use_class), "Cannot use both text and class conditioning simultaneously"
# Patch embedding
self.patch_embed = nn.Conv2d(self.in_channels, self.hidden_size,
kernel_size=self.patch_size, stride=self.patch_size)
# Fixed sin-cos positional embedding
latent_size = getattr(config, 'image_size', 32)
self.num_patches = (latent_size // self.patch_size) ** 2
self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, self.hidden_size), requires_grad=False)
# Time embedding
self.use_dit_time_embed = params.get("use_dit_time_embed", False)
if self.use_dit_time_embed:
self.time_embed = DiTTimestepEmbedder(self.hidden_size)
else:
self.time_embed = TimestepEmbedder(self.hidden_size)
# Global time block for AdaLN-Single
if self.use_adaln_single:
self.t_block = nn.Sequential(
nn.SiLU(),
nn.Linear(self.hidden_size, 6 * self.hidden_size, bias=True)
)
# Optional text conditioning
if self.use_text:
self.text_proj = nn.Linear(self.text_embed_dim, self.hidden_size)
self.text_norm = nn.LayerNorm(self.hidden_size, elementwise_affine=False, eps=1e-6)
# Note: null text embedding will be provided by empty string encoding from CLIP
# This is handled in the training loop, not as a learnable parameter
# Optional class conditioning (ImageNet style)
if self.use_class:
# Add 1 extra embedding for null/unconditional class
self.class_embed = nn.Embedding(params["num_classes"] + 1, self.hidden_size)
self.null_class_id = params["num_classes"] # Use last index as null class
# Transformer blocks
self.layers = nn.ModuleList([
DiTBlock(self.hidden_size, self.num_heads, params.get("mlp_ratio", 4.0),
self.use_text, use_adaln_single=self.use_adaln_single)
for _ in range(self.depth)
])
# Final layer with modulation
if self.use_adaln_single:
self.final_layer = T2IFinalLayer(self.hidden_size, self.patch_size, self.out_channels)
else:
self.final_layer = FinalLayer(self.hidden_size, self.patch_size, self.out_channels)
# Initialize weights
self.initialize_weights()
def initialize_weights(self):
# Initialize transformer layers
def _basic_init(module):
if isinstance(module, nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize positional embedding with sin-cos
grid_size = int(self.num_patches ** 0.5)
pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], grid_size)
self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
# Initialize patch_embed like nn.Linear
w = self.patch_embed.weight.data
nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
if self.patch_embed.bias is not None:
nn.init.constant_(self.patch_embed.bias, 0)
# Initialize timestep embedding MLP
nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
# Zero-out adaLN modulation layers in DiT blocks (from DiT paper)
for block in self.layers:
if block.adaLN_modulation is not None:
# Original AdaLN mode
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
# AdaLN-Single mode: scale_shift_table is already initialized with randn/sqrt(hidden_size)
# Zero-out cross-attention output projection (from PixArt-Alpha)
if self.use_text and hasattr(block, 'cross_attn'):
nn.init.constant_(block.cross_attn.out_proj.weight, 0)
nn.init.constant_(block.cross_attn.out_proj.bias, 0)
# Initialize text projection layer (analogous to PixArt's caption embedding)
if self.use_text and hasattr(self, 'text_proj'):
nn.init.normal_(self.text_proj.weight, std=0.02)
if self.text_proj.bias is not None:
nn.init.constant_(self.text_proj.bias, 0)
# Initialize class embedding layer (similar to DiT paper)
if self.use_class and hasattr(self, 'class_embed'):
nn.init.normal_(self.class_embed.weight, std=0.02)
# Initialize global t_block for AdaLN-Single
if self.use_adaln_single and hasattr(self, 't_block'):
nn.init.normal_(self.t_block[1].weight, std=0.02)
# Zero the bias for stability; the weight keeps a small normal init
nn.init.constant_(self.t_block[1].bias, 0)
# Zero-out output layers
if hasattr(self.final_layer, 'adaLN_modulation') and self.final_layer.adaLN_modulation is not None:
# Original FinalLayer
nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
# T2IFinalLayer scale_shift_table is already initialized with randn/sqrt(hidden_size)
nn.init.constant_(self.final_layer.linear.weight, 0)
nn.init.constant_(self.final_layer.linear.bias, 0)
def forward(self, xt: torch.Tensor, t: torch.Tensor, text_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
B, C, H, W = xt.shape
# Handle timestep scaling - DiT expects timesteps in [0, 999] range
# If t is normalized (in [0, 1]), scale it to [0, 999]
if t.max() <= 1.0 and t.min() >= 0.0:
# Normalized timesteps, scale to DiT range
t = t * 999.0
# Ensure t is in correct range for DiT
t = t.clamp(0, 999)
# Patchify
x = self.patch_embed(xt) # [B, hidden_size, H//p, W//p]
x = x.flatten(2).transpose(1, 2) # [B, num_patches, hidden_size]
x = x + self.pos_embed # Add positional embedding
# Prepare conditioning
time_emb = self.time_embed(t) # [B, hidden_size]
# Add class conditioning to time embedding if using class conditioning
if self.use_class and class_labels is not None:
class_emb = self.class_embed(class_labels) # [B, hidden_size]
time_emb = time_emb + class_emb # Additive combination following DiT paper
# Process conditioning based on AdaLN mode
if self.use_adaln_single:
# AdaLN-Single: compute global modulation once
c = self.t_block(time_emb) # [B, 6*hidden_size]
else:
# Original AdaLN: pass time embedding to each block
c = time_emb
# Prepare text tokens for cross-attention (not fused with time)
text_tokens = None
if self.use_text and text_embeds is not None:
if text_embeds.dim() == 3:
text_tokens = self.text_proj(text_embeds) # [B, T, hidden_size]
text_tokens = self.text_norm(text_tokens)
else:
text_tokens = self.text_proj(text_embeds).unsqueeze(1) # [B, 1, hidden_size]
text_tokens = self.text_norm(text_tokens)
if attention_mask is not None:
# cast to bool and truncate to the text token length
attention_mask = attention_mask[:, :text_tokens.shape[1]].to(torch.bool)
# safety: avoid all-false rows (would yield NaNs in softmax)
all_false = attention_mask.sum(dim=1) == 0
if all_false.any():
attention_mask[all_false, 0] = True
# Apply transformer blocks
for layer in self.layers:
x = layer(x, c, text_tokens, attention_mask)
# Final projection
if self.use_adaln_single:
# T2IFinalLayer expects original time embedding, not global modulation
x = self.final_layer(x, time_emb) # [B, num_patches, patch_size^2 * out_channels]
else:
# Original FinalLayer expects conditioning
x = self.final_layer(x, c) # [B, num_patches, patch_size^2 * out_channels]
# Unpatchify
patch_h = patch_w = int(self.num_patches ** 0.5)
x = x.view(B, patch_h, patch_w, self.patch_size, self.patch_size, self.out_channels)
x = x.permute(0, 5, 1, 3, 2, 4).contiguous()
x = x.view(B, self.out_channels, H, W)
return x
def compute_loss(self, x0: torch.Tensor, text_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None,
null_text_embeds: Optional[torch.Tensor] = None, null_attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Unified loss computation based on objective type"""
if self.objective_type == "ddpm":
return self.ddpm_loss(x0, text_embeds, attention_mask, class_labels, null_text_embeds, null_attention_mask)
elif self.objective_type == "fm":
return self.flow_matching_loss(x0, text_embeds, attention_mask, class_labels, null_text_embeds, null_attention_mask)
elif self.objective_type == "rf":
return self.rectified_flow_loss(x0, text_embeds, attention_mask, class_labels, null_text_embeds, null_attention_mask)
else:
raise ValueError(f"Unknown objective type: {self.objective_type}")
def ddpm_loss(self, x0: torch.Tensor, text_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None,
null_text_embeds: Optional[torch.Tensor] = None, null_attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""DDPM: predict noise ε"""
B = x0.shape[0]
device = x0.device
# Sample time uniformly
t = torch.rand(B, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
# Apply CFG dropout during training
if self.training and self.cfg_dropout_prob > 0:
if self.use_text and text_embeds is not None:
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob) # True = keep text
if null_text_embeds is not None:
# Use provided null text embeddings (from empty string CLIP encoding)
if null_text_embeds.shape[0] == 1:
null_text_embeds = null_text_embeds.expand(B, -1, -1)
# Replace dropped samples with null text embeddings
dropped = ~keep
if dropped.any():
text_embeds = text_embeds.clone()
text_embeds[dropped] = null_text_embeds[dropped]
# Use provided null attention mask or create default for empty string
if attention_mask is not None:
attention_mask = attention_mask.clone()
if null_attention_mask is not None:
if null_attention_mask.shape[0] == 1:
null_attention_mask = null_attention_mask.expand(B, -1)
attention_mask[dropped] = null_attention_mask[dropped]
else:
attention_mask[dropped] = 0
attention_mask[dropped, 0] = 1
else:
# Fallback to old zeroing approach if null_text_embeds not provided
if text_embeds.dim() == 3: # [B, T, D]
text_embeds = text_embeds * keep[:, None, None].to(text_embeds.dtype)
else: # [B, D]
text_embeds = text_embeds * keep[:, None].to(text_embeds.dtype)
if attention_mask is not None:
attention_mask = attention_mask.clone()
dropped = ~keep
if dropped.any():
attention_mask[dropped, 0] = 1
elif self.use_class and class_labels is not None:
# Apply CFG dropout to class labels using null class embedding
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob)
null_class = torch.full_like(class_labels, self.null_class_id)
class_labels = torch.where(keep, class_labels, null_class)
# Predict noise
pred_eps = self.forward(xt, t, text_embeds, attention_mask, class_labels)
return F.mse_loss(pred_eps, noise)
def rectified_flow_loss(self, x0: torch.Tensor, text_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None,
null_text_embeds: Optional[torch.Tensor] = None, null_attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Rectified Flow: predict velocity v = x_1 - x_0 (straight paths)"""
B = x0.shape[0]
device = x0.device
# Sample time uniformly
t = torch.rand(B, device=device)
# Straight-line interpolation
x1 = torch.randn_like(x0) # Gaussian noise as x_1
xt = (1 - t).view(-1, 1, 1, 1) * x0 + t.view(-1, 1, 1, 1) * x1
# Apply CFG dropout during training
if self.training and self.cfg_dropout_prob > 0:
if self.use_text and text_embeds is not None:
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob) # True = keep text
if null_text_embeds is not None:
# Use provided null text embeddings (from empty string CLIP encoding)
if null_text_embeds.shape[0] == 1:
null_text_embeds = null_text_embeds.expand(B, -1, -1)
# Replace dropped samples with null text embeddings
dropped = ~keep
if dropped.any():
text_embeds = text_embeds.clone()
text_embeds[dropped] = null_text_embeds[dropped]
# Use provided null attention mask or create default for empty string
if attention_mask is not None:
attention_mask = attention_mask.clone()
if null_attention_mask is not None:
if null_attention_mask.shape[0] == 1:
null_attention_mask = null_attention_mask.expand(B, -1)
attention_mask[dropped] = null_attention_mask[dropped]
else:
attention_mask[dropped] = 0
attention_mask[dropped, 0] = 1
else:
# Fallback to old zeroing approach if null_text_embeds not provided
if text_embeds.dim() == 3: # [B, T, D]
text_embeds = text_embeds * keep[:, None, None].to(text_embeds.dtype)
else: # [B, D]
text_embeds = text_embeds * keep[:, None].to(text_embeds.dtype)
if attention_mask is not None:
attention_mask = attention_mask.clone()
dropped = ~keep
if dropped.any():
attention_mask[dropped, 0] = 1
elif self.use_class and class_labels is not None:
# Apply CFG dropout to class labels using null class embedding
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob)
null_class = torch.full_like(class_labels, self.null_class_id)
class_labels = torch.where(keep, class_labels, null_class)
# Predict velocity (x_1 - x_0)
pred_v = self.forward(xt, t, text_embeds, attention_mask, class_labels)
true_v = x1 - x0
return F.mse_loss(pred_v, true_v)
def flow_matching_loss(self, x0: torch.Tensor, text_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None,
null_text_embeds: Optional[torch.Tensor] = None, null_attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Flow matching loss for latent space training with CFG dropout."""
B = x0.shape[0]
device = x0.device
# Sample time uniformly
t = torch.rand(B, device=device)
# Use proper schedule (NEW)
alpha_t, sigma_t = self.schedule.get_schedule(t)
noise = torch.randn_like(x0)
xt = alpha_t.view(-1, 1, 1, 1) * x0 + sigma_t.view(-1, 1, 1, 1) * noise
# Apply CFG dropout during training
if self.training and self.cfg_dropout_prob > 0:
if self.use_text and text_embeds is not None:
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob) # True = keep text
if null_text_embeds is not None:
# Use provided null text embeddings (from empty string CLIP encoding)
# Ensure null_text_embeds matches the batch size
if null_text_embeds.shape[0] == 1:
null_text_embeds = null_text_embeds.expand(B, -1, -1)
# Replace dropped samples with null text embeddings
dropped = ~keep
if dropped.any():
text_embeds = text_embeds.clone()
text_embeds[dropped] = null_text_embeds[dropped]
# Use provided null attention mask or create default for empty string
if attention_mask is not None:
attention_mask = attention_mask.clone()
if null_attention_mask is not None:
# Ensure null_attention_mask matches batch size
if null_attention_mask.shape[0] == 1:
null_attention_mask = null_attention_mask.expand(B, -1)
attention_mask[dropped] = null_attention_mask[dropped]
else:
# Default: For null text (empty string), typically only the first token is valid
attention_mask[dropped] = 0
attention_mask[dropped, 0] = 1 # Keep only first token for empty string
else:
# Fallback to old zeroing approach if null_text_embeds not provided
if text_embeds.dim() == 3: # [B, T, D]
text_embeds = text_embeds * keep[:, None, None].to(text_embeds.dtype)
else: # [B, D]
text_embeds = text_embeds * keep[:, None].to(text_embeds.dtype)
# Handle attention mask for fallback approach
if attention_mask is not None:
attention_mask = attention_mask.clone()
dropped = ~keep
if dropped.any():
attention_mask[dropped, 0] = 1
elif self.use_class and class_labels is not None:
# Apply CFG dropout to class labels using null class embedding
keep = (torch.rand(B, device=device) > self.cfg_dropout_prob) # True = keep class
# Use the dedicated null class embedding for unconditional generation
null_class = torch.full_like(class_labels, self.null_class_id)
class_labels = torch.where(keep, class_labels, null_class)
# Predict velocity
pred_v = self.forward(xt, t, text_embeds, attention_mask, class_labels)
true_v = noise - x0
return F.mse_loss(pred_v, true_v)
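# End-to-end shape sketch for a small class-conditional DiT expert
# (illustrative: a stand-in config namespace replaces the repo's real config
# class, and the `schedules` module must be importable).
def _demo_dit_expert() -> torch.Tensor:
    from types import SimpleNamespace
    cfg = SimpleNamespace(
        image_size=32,   # latent resolution
        expert_params={
            "hidden_size": 256, "num_layers": 2, "num_heads": 4,
            "use_class_conditioning": True, "num_classes": 10,
        },
    )
    expert = DiTExpert(cfg)
    xt = torch.randn(2, 4, 32, 32)            # 4-channel VAE latents
    t = torch.rand(2)                         # normalized timesteps, rescaled internally
    labels = torch.randint(0, 10, (2,))
    out = expert(xt, t, class_labels=labels)  # -> [2, 4, 32, 32]
    assert out.shape == xt.shape
    return out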
# =============================================================================
# ROUTER MODELS
# =============================================================================
class ViTRouter(nn.Module):
"""ViT-based router for cluster classification"""
def __init__(self, config) -> None:
super().__init__()
# Default params
default_params = {
"hidden_size": 384,
"num_layers": 6,
"num_heads": 6,
"patch_size": 8,
"use_dit_time_embed": False, # Whether to use DiT-style time embedding
}
params = {**default_params, **config.router_params}
if config.router_pretrained:
# Use pretrained ViT and adapt
self.vit = ViTForImageClassification.from_pretrained(
"google/vit-base-patch16-224"
)
self._adapt_pretrained(config, params)
else:
# Build from scratch
vit_config = ViTConfig(
image_size=config.image_size,
patch_size=params["patch_size"],
num_channels=config.num_channels,
hidden_size=params["hidden_size"],
num_hidden_layers=params["num_layers"],
num_attention_heads=params["num_heads"],
num_labels=config.num_clusters
)
self.vit = ViTForImageClassification(vit_config)
# Time conditioning - support both embedding styles
self.use_dit_time_embed = params.get("use_dit_time_embed", False)
if self.use_dit_time_embed:
# Use DiT-style timestep embedding for consistency
self.time_embedding = DiTTimestepEmbedder(params["hidden_size"])
else:
# Original simple time embedding
self.time_embedding = nn.Sequential(
nn.Linear(1, params["hidden_size"]),
nn.SiLU(),
nn.Linear(params["hidden_size"], params["hidden_size"])
)
# Combined classifier
self.classifier = nn.Sequential(
nn.Linear(params["hidden_size"] * 2, params["hidden_size"]),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(params["hidden_size"], config.num_clusters)
)
def _adapt_pretrained(self, config, params) -> ViTForImageClassification:
"""Adapt pretrained ViT for our task"""
# Modify patch embeddings if needed
if config.image_size != 224 or config.num_channels != 3:
self.vit.vit.embeddings.patch_embeddings.projection = nn.Conv2d(
config.num_channels,
self.vit.config.hidden_size,
kernel_size=params["patch_size"],
stride=params["patch_size"]
)
def forward(self, xt: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
# Process image through ViT
vit_outputs = self.vit.vit(xt)
image_features = vit_outputs.last_hidden_state[:, 0] # CLS token
# Time conditioning
if self.use_dit_time_embed:
# DiT embedder expects raw timesteps
time_features = self.time_embedding(t)
else:
# Original embedding needs unsqueeze
time_features = self.time_embedding(t.unsqueeze(-1))
# Combine and classify
combined = torch.cat([image_features, time_features], dim=1)
return self.classifier(combined)
class CNNRouter(nn.Module):
"""Simple CNN router for cluster classification"""
def __init__(self, config) -> None:
super().__init__()
# Default params
default_params = {
"hidden_dims": [64, 128, 256],
"use_dit_time_embed": False, # Whether to use DiT-style time embedding
}
params = {**default_params, **config.router_params}
# CNN backbone
self.backbone = self._build_cnn(config.num_channels, params["hidden_dims"])
# Time embedding - support both styles
self.use_dit_time_embed = params.get("use_dit_time_embed", False)
if self.use_dit_time_embed:
# Use DiT-style timestep embedding, output to 128 dims for CNN
self.time_embedding = DiTTimestepEmbedder(128)
else:
# Original simple time embedding
self.time_embedding = nn.Sequential(
nn.Linear(1, 128),
nn.SiLU(),
nn.Linear(128, 128)
)
# Classifier
self.classifier = nn.Sequential(
nn.Linear(params["hidden_dims"][-1] + 128, 256),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(256, config.num_clusters)
)
def _build_cnn(self, in_channels: int, hidden_dims: List[int]) -> nn.Sequential:
layers = []
prev_dim = in_channels
for dim in hidden_dims:
layers.extend([
nn.Conv2d(prev_dim, dim, 3, padding=1),
nn.ReLU(),
nn.Conv2d(dim, dim, 3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2)
])
prev_dim = dim
layers.append(nn.AdaptiveAvgPool2d(1))
layers.append(nn.Flatten())
return nn.Sequential(*layers)
def forward(self, xt: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
# CNN features
img_features = self.backbone(xt)
# Time features
if self.use_dit_time_embed:
# DiT embedder expects raw timesteps
time_features = self.time_embedding(t)
else:
# Original embedding needs unsqueeze
time_features = self.time_embedding(t.unsqueeze(-1))
# Combine and classify
combined = torch.cat([img_features, time_features], dim=1)
return self.classifier(combined)
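# Routing sketch (illustrative, stand-in config): the router maps a noisy
# batch plus its timesteps to per-cluster logits; argmax (or a softmax over
# logits) picks the expert for each sample.
def _demo_cnn_router() -> torch.Tensor:
    from types import SimpleNamespace
    cfg = SimpleNamespace(num_channels=3, num_clusters=4, router_params={})
    router = CNNRouter(cfg)
    xt = torch.randn(2, 3, 32, 32)
    logits = router(xt, torch.rand(2))   # -> [2, 4]
    expert_ids = logits.argmax(dim=-1)   # hard routing decision per sample
    assert logits.shape == (2, 4) and expert_ids.shape == (2,)
    return logits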
class DiTRouter(nn.Module):
"""DiT B/2 router for cluster classification"""
def __init__(self, config):
super().__init__()
# DiT B/2 specifications
default_params = {
"hidden_size": 768, # DiT-B uses 768
"num_layers": 12, # DiT-B uses 12 layers
"num_heads": 12, # DiT-B uses 12 heads
"patch_size": 2, # For latent space (32x32 -> 16x16 patches)
"in_channels": 4, # VAE latent channels
"mlp_ratio": 4.0,
"use_dit_time_embed": False, # Whether to use DiT-style time embedding
}
params = {**default_params, **config.router_params}
self.patch_size = params["patch_size"]
self.in_channels = params["in_channels"]
self.hidden_size = params["hidden_size"]
self.num_heads = params["num_heads"]
self.num_clusters = config.num_clusters
# Patch embedding (same as expert)
self.patch_embed = nn.Conv2d(
self.in_channels, self.hidden_size,
kernel_size=self.patch_size, stride=self.patch_size
)
# Calculate number of patches
latent_size = getattr(config, 'image_size', 32) # Assuming 256/8=32 for VAE
self.num_patches = (latent_size // self.patch_size) ** 2
# Fixed sin-cos positional embedding (same as expert)
self.pos_embed = nn.Parameter(
torch.zeros(1, self.num_patches, self.hidden_size),
requires_grad=False
)
# CLS token (KEY ADDITION from paper)
self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
# Time embedding - match expert's choice
self.use_dit_time_embed = params.get("use_dit_time_embed", False)
if self.use_dit_time_embed:
self.time_embed = DiTTimestepEmbedder(self.hidden_size)
else:
self.time_embed = TimestepEmbedder(self.hidden_size)
# DiT blocks with AdaLN (reuse DiTBlock from expert)
# Note: Router doesn't need text conditioning
self.layers = nn.ModuleList([
DiTBlock(self.hidden_size, self.num_heads, params["mlp_ratio"], use_text=False)
for _ in range(params["num_layers"])
])
# Final layer norm
self.norm_final = nn.LayerNorm(self.hidden_size, elementwise_affine=False, eps=1e-6)
# Classification head on the CLS token (MLP variant of the paper's plain linear classifier, kept commented below)
# self.head = nn.Linear(self.hidden_size, self.num_clusters)
self.head = nn.Sequential(
nn.Linear(self.hidden_size, self.hidden_size),
nn.GELU(),
nn.LayerNorm(self.hidden_size),
nn.Dropout(0.1),
nn.Linear(self.hidden_size, self.num_clusters)
)
# Initialize weights
self.initialize_weights()
def initialize_weights(self):
# Initialize transformer layers
def _basic_init(module):
if isinstance(module, nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize CLS token
nn.init.normal_(self.cls_token, std=0.02)
# Initialize positional embedding with sin-cos (same as expert)
grid_size = int(self.num_patches ** 0.5)
pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], grid_size)
self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
# Initialize patch_embed like nn.Linear
w = self.patch_embed.weight.data
nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
if self.patch_embed.bias is not None:
nn.init.constant_(self.patch_embed.bias, 0)
# Initialize timestep embedding MLP
if hasattr(self.time_embed, 'mlp'):
nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
# Zero-out adaLN modulation in blocks (following expert initialization)
for block in self.layers:
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
# # Init for the simpler linear head (kept commented for reference)
# nn.init.constant_(self.head.weight, 0)
# nn.init.constant_(self.head.bias, 0)
# Initialize classification head (Sequential)
# Initialize intermediate layers normally, zero-out final layer
nn.init.normal_(self.head[0].weight, std=0.02) # First linear layer
if self.head[0].bias is not None:
nn.init.constant_(self.head[0].bias, 0)
# Zero-out final classification layer (following DiT paper)
nn.init.constant_(self.head[-1].weight, 0) # Last linear layer
if self.head[-1].bias is not None:
nn.init.constant_(self.head[-1].bias, 0)
def forward(self, xt: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
B, C, H, W = xt.shape
# Match expert's timestep interpretation
if t.max() <= 1.0 and t.min() >= 0.0:
t = t * 999.0
t = t.clamp(0, 999)
# Patchify
x = self.patch_embed(xt) # [B, hidden_size, H//p, W//p]
x = x.flatten(2).transpose(1, 2) # [B, num_patches, hidden_size]
# Add positional embedding
x = x + self.pos_embed
# Prepend CLS token
cls_tokens = self.cls_token.expand(B, -1, -1) # [B, 1, hidden_size]
x = torch.cat([cls_tokens, x], dim=1) # [B, 1 + num_patches, hidden_size]
# Time conditioning
c = self.time_embed(t) # [B, hidden_size]
# Apply DiT blocks with AdaLN modulation
for layer in self.layers:
x = layer(x, c, text_emb=None)
# Extract CLS token and apply final norm
cls_output = x[:, 0] # [B, hidden_size]
cls_output = self.norm_final(cls_output)
# Linear classification head
logits = self.head(cls_output) # [B, num_clusters]
return logits
# =============================================================================
# DETERMINISTIC ROUTER (for controlled experiments)
# =============================================================================
class DeterministicTimestepRouter(nn.Module):
"""
Deterministic router that assigns experts based on timestep.
Useful for controlled experiments where you want to test specific routing strategies,
such as: "high noise → DDPM expert, low noise → FM expert"
Args:
config: Config object with router_params containing:
- timestep_threshold: t value to switch experts (default: 0.5)
- high_noise_expert: Expert ID for t > threshold (default: 0, typically DDPM)
- low_noise_expert: Expert ID for t <= threshold (default: 1, typically FM)
Example config:
router_architecture: "deterministic_timestep"
router_params:
timestep_threshold: 0.5
high_noise_expert: 0 # DDPM for high noise
low_noise_expert: 1 # FM for low noise
"""
def __init__(self, config):
super().__init__()
self.num_experts = config.num_experts
self.threshold = config.router_params.get('timestep_threshold', 0.5)
self.high_noise_expert = config.router_params.get('high_noise_expert', 0)
self.low_noise_expert = config.router_params.get('low_noise_expert', 1)
# Validate expert IDs
assert 0 <= self.high_noise_expert < self.num_experts, \
f"high_noise_expert {self.high_noise_expert} out of range [0, {self.num_experts})"
assert 0 <= self.low_noise_expert < self.num_experts, \
f"low_noise_expert {self.low_noise_expert} out of range [0, {self.num_experts})"
# Validate threshold
assert 0.0 <= self.threshold <= 1.0, \
f"timestep_threshold {self.threshold} must be in [0, 1]"
# This router has no trainable parameters
# Register threshold as buffer (not trained, but saved with model)
self.register_buffer('_threshold', torch.tensor(self.threshold))
print(f"DeterministicTimestepRouter initialized:")
print(f" Threshold: {self.threshold}")
print(f" High noise (t > {self.threshold}) → Expert {self.high_noise_expert}")
print(f" Low noise (t <= {self.threshold}) → Expert {self.low_noise_expert}")
def forward(self, x: torch.Tensor, t: torch.Tensor, **kwargs) -> torch.Tensor:
"""
Returns one-hot routing probabilities based on timestep.
Args:
x: Input tensor (unused, but kept for API compatibility with other routers)
t: Timesteps, shape (B,)
Returns:
routing_probs: Shape (B, num_experts), one-hot encoded
"""
B = t.shape[0]
device = t.device
# Initialize routing probabilities (all zeros)
routing_probs = torch.zeros(B, self.num_experts, device=device)
# High noise (t > threshold) → high_noise_expert
# Low noise (t <= threshold) → low_noise_expert
high_noise_mask = t > self.threshold
routing_probs[high_noise_mask, self.high_noise_expert] = 1.0
routing_probs[~high_noise_mask, self.low_noise_expert] = 1.0
return routing_probs
def train(self, mode: bool = True):
"""Override train() - this router is never trained, always in eval mode"""
return super(DeterministicTimestepRouter, self).train(False)
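# Threshold-routing sketch (illustrative, stand-in config): with the split at
# t=0.5, high-noise samples are sent to expert 0 and low-noise samples to
# expert 1, as one-hot probabilities.
def _demo_deterministic_router() -> torch.Tensor:
    from types import SimpleNamespace
    cfg = SimpleNamespace(num_experts=2,
                          router_params={"timestep_threshold": 0.5,
                                         "high_noise_expert": 0,
                                         "low_noise_expert": 1})
    router = DeterministicTimestepRouter(cfg)
    t = torch.tensor([0.9, 0.2])
    probs = router(x=None, t=t)          # -> one-hot [2, 2]
    assert probs.argmax(dim=-1).tolist() == [0, 1]
    return probs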
# =============================================================================
# ADAPTIVE VIDEO ROUTER (for Video DDM)
# =============================================================================
class AdaptiveVideoRouter(nn.Module):
"""
Time-adaptive router for video DDM.
Key innovation: Learns optimal weighting of information sources
at each noise level, solving the "motion invisible at t=1" problem.
Information availability is time-dependent:
t ~ 1.0: Only text/first_frame informative → Route on conditioning
t ~ 0.5: Structure emerging → Latent becomes useful
t ~ 0.1: Near clean → Full information available
Expected learned behavior:
| Noise Level | Text | Frame | Latent | Behavior |
|-------------|------|-------|--------|-----------------------------|
| t ~ 1.0 | ~0.7 | ~0.2 | ~0.1 | Routes on text semantics |
| t ~ 0.5 | ~0.4 | ~0.3 | ~0.3 | Balanced; emerging structure|
| t ~ 0.1 | ~0.2 | ~0.2 | ~0.6 | Trusts latent; fine-grained |
Enhancements:
- Masked mean pooling for text (handles variable-length prompts)
- Temporal-aware latent encoder (captures motion patterns)
- Temperature scaling for inference control
"""
def __init__(self, config):
super().__init__()
# Default params
default_params = {
"hidden_dim": 512,
"text_embed_dim": 768, # CLIP-L text embedding dimension
"frame_embed_dim": 768, # DINOv2-B (base) feature dimension
"latent_channels": 16, # VAE latent channels (CogVideoX uses 16)
"latent_conv_dim": 64, # Intermediate conv channels for latent encoder
"dropout": 0.1,
"temporal_pool_mode": "attention", # "attention", "avg", or "max"
"normalize_inputs": True, # L2-normalize text/frame inputs (match clustering)
}
params = {**default_params, **getattr(config, 'router_params', {})}
self.hidden_dim = params["hidden_dim"]
self.num_experts = getattr(config, 'num_experts', config.num_clusters)
self.latent_channels = params["latent_channels"]
self.latent_conv_dim = params["latent_conv_dim"]
self.temporal_pool_mode = params["temporal_pool_mode"]
self.normalize_inputs = params.get("normalize_inputs", True)
# === Information Source Encoders ===
# Text pathway (always available, primary signal at high t)
self.text_encoder = nn.Sequential(
nn.Linear(params["text_embed_dim"], self.hidden_dim),
nn.LayerNorm(self.hidden_dim),
nn.GELU(),
nn.Linear(self.hidden_dim, self.hidden_dim)
)
# First frame pathway (available for I2V tasks)
# Uses DINOv2 features extracted from the first frame
self.frame_encoder = nn.Sequential(
nn.Linear(params["frame_embed_dim"], self.hidden_dim),
nn.LayerNorm(self.hidden_dim),
nn.GELU(),
nn.Linear(self.hidden_dim, self.hidden_dim)
)
# === Temporal-Aware Latent Encoder ===
# Captures both spatial content and temporal motion patterns
# Spatial feature extraction (per-frame)
self.spatial_conv = nn.Sequential(
nn.Conv3d(params["latent_channels"], params["latent_conv_dim"],
kernel_size=(1, 3, 3), padding=(0, 1, 1)), # Spatial only
nn.GroupNorm(8, params["latent_conv_dim"]),
nn.GELU(),
)
# Temporal feature extraction (motion patterns)
self.temporal_conv = nn.Sequential(
nn.Conv3d(params["latent_conv_dim"], params["latent_conv_dim"],
kernel_size=(3, 1, 1), padding=(1, 0, 0)), # Temporal only
nn.GroupNorm(8, params["latent_conv_dim"]),
nn.GELU(),
)
# Combined spatio-temporal processing
self.st_conv = nn.Sequential(
nn.Conv3d(params["latent_conv_dim"], params["latent_conv_dim"],
kernel_size=3, padding=1), # Full 3D
nn.GroupNorm(8, params["latent_conv_dim"]),
nn.GELU(),
)
# Spatial pooling (keep temporal dimension)
self.spatial_pool = nn.AdaptiveAvgPool3d((None, 1, 1)) # [B, C, T, 1, 1]
# Temporal attention pooling (learns which frames matter for routing)
if self.temporal_pool_mode == "attention":
self.temporal_attn = nn.Sequential(
nn.Linear(params["latent_conv_dim"], params["latent_conv_dim"] // 4),
nn.Tanh(),
nn.Linear(params["latent_conv_dim"] // 4, 1),
)
# Motion feature extractor (frame differences)
self.motion_encoder = nn.Sequential(
nn.Linear(params["latent_conv_dim"], params["latent_conv_dim"]),
nn.GELU(),
nn.Linear(params["latent_conv_dim"], self.hidden_dim // 2),
)
# Content feature projector
self.content_proj = nn.Linear(params["latent_conv_dim"], self.hidden_dim // 2)
# Final latent projection (combines content + motion)
self.latent_proj = nn.Sequential(
nn.Linear(self.hidden_dim, self.hidden_dim),
nn.LayerNorm(self.hidden_dim),
)
# === Time-Dependent Weighting ===
# Time embedding using existing infrastructure
self.time_embed = TimestepEmbedder(self.hidden_dim)
self.time_mlp = nn.Sequential(
nn.Linear(self.hidden_dim, self.hidden_dim),
nn.GELU(),
nn.Linear(self.hidden_dim, self.hidden_dim)
)
# Learns adaptive weighting: at high t → trust text; at low t → trust latent
self.source_weighting = nn.Sequential(
nn.Linear(self.hidden_dim, 128),
nn.GELU(),
nn.Linear(128, 3), # [text, frame, latent] weights
nn.Softmax(dim=-1)
)
# === Routing Head ===
self.router_head = nn.Sequential(
nn.Linear(self.hidden_dim, self.hidden_dim),
nn.GELU(),
nn.LayerNorm(self.hidden_dim),
nn.Dropout(params["dropout"]),
nn.Linear(self.hidden_dim, self.num_experts)
)
# Initialize weights
self.initialize_weights()
def initialize_weights(self):
"""Initialize weights following DiT conventions."""
def _basic_init(module):
if isinstance(module, nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.Conv3d):
# Flatten spatial dims for xavier init
w = module.weight.data
nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize timestep embedding MLP (following DiT)
if hasattr(self.time_embed, 'mlp'):
nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
# Small non-zero initialization for final routing layer
# (pure zeros cause uniform outputs that break temperature scaling)
nn.init.normal_(self.router_head[-1].weight, std=0.01)
nn.init.constant_(self.router_head[-1].bias, 0)
# Initialize source weighting to start roughly uniform
# The softmax will make [0, 0, 0] → [0.33, 0.33, 0.33]
nn.init.constant_(self.source_weighting[-2].weight, 0)
nn.init.constant_(self.source_weighting[-2].bias, 0)
# Initialize temporal attention to uniform attention
if self.temporal_pool_mode == "attention":
nn.init.constant_(self.temporal_attn[-1].weight, 0)
nn.init.constant_(self.temporal_attn[-1].bias, 0)
def _masked_mean_pool(self, embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Compute masked mean pooling over sequence dimension.
Args:
embeddings: [B, seq_len, embed_dim] - Token embeddings
attention_mask: [B, seq_len] - 1 for real tokens, 0 for padding
Returns:
pooled: [B, embed_dim] - Pooled representation
"""
if attention_mask is None:
# No mask provided, use simple mean
return embeddings.mean(dim=1)
# Expand mask for broadcasting: [B, seq_len] -> [B, seq_len, 1]
mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)
# Masked sum
masked_sum = (embeddings * mask).sum(dim=1) # [B, embed_dim]
# Count of valid tokens (avoid division by zero)
token_counts = mask.sum(dim=1).clamp(min=1.0) # [B, 1]
return masked_sum / token_counts
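# Minimal usage sketch for _masked_mean_pool (the tensors and the `router` instance
# below are illustrative placeholders, not values from the repo):
#   emb  = torch.randn(2, 4, 768)                      # [B=2, seq_len=4, embed_dim=768]
#   mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 1]])  # first sequence has 2 padded tokens
#   pooled = router._masked_mean_pool(emb, mask)       # [2, 768]
# The first row averages only its two real tokens: padded positions are zeroed before
# the sum and excluded from the token count, so they cannot dilute the mean.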
def _encode_latent_temporal(self, x_t: torch.Tensor) -> torch.Tensor:
"""
Encode video latent with temporal awareness.
Extracts both:
- Content features: What is in the video (spatial)
- Motion features: How things move (temporal differences)
Args:
x_t: [B, C, T, H, W] - Noisy video latent
Returns:
latent_feat: [B, hidden_dim] - Combined latent features
"""
B, C, T, H, W = x_t.shape
# 1. Spatial feature extraction
spatial_feat = self.spatial_conv(x_t) # [B, conv_dim, T, H, W]
# 2. Temporal feature extraction (captures local motion)
temporal_feat = self.temporal_conv(spatial_feat) # [B, conv_dim, T, H, W]
# 3. Combined spatio-temporal processing
st_feat = self.st_conv(temporal_feat) # [B, conv_dim, T, H, W]
# 4. Pool spatially, keep temporal: [B, conv_dim, T, 1, 1] -> [B, T, conv_dim]
pooled = self.spatial_pool(st_feat).squeeze(-1).squeeze(-1) # [B, conv_dim, T]
pooled = pooled.permute(0, 2, 1) # [B, T, conv_dim]
# 5. Temporal pooling with optional attention
if self.temporal_pool_mode == "attention" and T > 1:
# Learn which frames matter for routing
attn_scores = self.temporal_attn(pooled).squeeze(-1) # [B, T]
attn_weights = F.softmax(attn_scores, dim=-1) # [B, T]
content_feat = (pooled * attn_weights.unsqueeze(-1)).sum(dim=1) # [B, conv_dim]
elif self.temporal_pool_mode == "max":
content_feat = pooled.max(dim=1)[0] # [B, conv_dim]
else: # "avg"
content_feat = pooled.mean(dim=1) # [B, conv_dim]
# 6. Extract motion features (frame differences)
if T > 1:
# Compute frame-to-frame differences
frame_diffs = pooled[:, 1:] - pooled[:, :-1] # [B, T-1, conv_dim]
# Motion magnitude and direction encoding
motion_feat = self.motion_encoder(frame_diffs.mean(dim=1)) # [B, hidden_dim//2]
else:
# Single frame: no motion signal (match pooled dtype so the concat below is safe under autocast)
motion_feat = torch.zeros(B, self.hidden_dim // 2, device=x_t.device, dtype=pooled.dtype)
# 7. Project content features
content_proj = self.content_proj(content_feat) # [B, hidden_dim//2]
# 8. Combine content + motion
combined = torch.cat([content_proj, motion_feat], dim=-1) # [B, hidden_dim]
latent_feat = self.latent_proj(combined) # [B, hidden_dim]
return latent_feat
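# Shape walk-through for _encode_latent_temporal, assuming hidden_dim=512 and
# latent_conv_dim=64 (illustrative values, not necessarily the configured ones):
#   x_t [B, C, T, H, W] --spatial/temporal/st convs--> [B, 64, T, H, W]
#   --AdaptiveAvgPool3d((None, 1, 1)) + squeeze--> [B, 64, T] --permute--> [B, T, 64]
#   attention / max / avg over T --> content_feat [B, 64] --content_proj--> [B, 256]
#   frame diffs averaged over T-1 --motion_encoder--> motion_feat [B, 256]
#   cat(content, motion) --> [B, 512] --latent_proj--> latent_feat [B, 512]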
def forward(self, x_t: torch.Tensor, t: torch.Tensor,
text_embed: torch.Tensor,
first_frame_feat: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
temperature: float = 1.0) -> torch.Tensor:
"""
Compute routing logits with time-adaptive information weighting.
Args:
x_t: Noisy video latent [B, C, T, H, W]
t: Noise level [B] in [0, 1] or [0, 999]
text_embed: CLIP text embedding [B, text_embed_dim] or [B, seq_len, text_embed_dim]
first_frame_feat: Optional DINOv2 features [B, frame_embed_dim]
attention_mask: Optional [B, seq_len] mask for text (1=valid, 0=padding)
temperature: Softmax temperature for sharper/softer routing (default: 1.0)
Returns:
logits: Expert selection logits [B, num_experts] (scaled by temperature)
"""
B = x_t.shape[0]
device = x_t.device
# === Encode each information source ===
# Handle both pooled [B, D] and sequence [B, seq_len, D] text embeddings
if text_embed.dim() == 3:
# Use masked mean pooling for sequence embeddings
text_embed_pooled = self._masked_mean_pool(text_embed, attention_mask)
else:
# Already pooled
text_embed_pooled = text_embed
# L2-normalize inputs to match clustering preprocessing
if self.normalize_inputs:
text_embed_pooled = F.normalize(text_embed_pooled, p=2, dim=-1)
text_feat = self.text_encoder(text_embed_pooled) # [B, hidden_dim]
# Frame features (optional for T2V, required for I2V)
if first_frame_feat is not None:
# L2-normalize to match clustering preprocessing
if self.normalize_inputs:
first_frame_feat = F.normalize(first_frame_feat, p=2, dim=-1)
frame_feat = self.frame_encoder(first_frame_feat) # [B, hidden_dim]
else:
frame_feat = torch.zeros(B, self.hidden_dim, device=device, dtype=text_feat.dtype)
# Latent features from noisy video (temporal-aware encoding)
latent_feat = self._encode_latent_temporal(x_t) # [B, hidden_dim]
# === Time-dependent weighting ===
# Normalize timesteps to [0, 999] for TimestepEmbedder
if t.max() <= 1.0:
t_scaled = t * 999.0
else:
t_scaled = t
t_scaled = t_scaled.clamp(0, 999)
# Get time features
time_emb = self.time_embed(t_scaled) # [B, hidden_dim]
time_feat = self.time_mlp(time_emb) # [B, hidden_dim]
# Compute adaptive weights based on noise level
# Network learns: high t → high text weight; low t → high latent weight
weights = self.source_weighting(time_feat) # [B, 3]
# === Adaptive combination ===
combined = (
weights[:, 0:1] * text_feat + # Text contribution
weights[:, 1:2] * frame_feat + # Frame contribution
weights[:, 2:3] * latent_feat # Latent contribution
)
# Final routing decision (incorporate time context)
logits = self.router_head(combined + time_feat)
# Apply temperature scaling (lower temp = sharper routing)
if temperature != 1.0:
logits = logits / temperature
return logits
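# Hedged example of a routing call; shapes follow the docstring above, but the
# concrete sizes (C=4, T=8, 32x32 latents, 77x768 text tokens) are placeholders:
#   x_t        = torch.randn(2, 4, 8, 32, 32)             # [B, C, T, H, W] noisy latents
#   t          = torch.rand(2)                            # noise levels in [0, 1]
#   text_embed = torch.randn(2, 77, 768)                  # token-level text embeddings
#   logits = router(x_t, t, text_embed, temperature=0.5)  # [2, num_experts], sharpened
#   expert_idx = logits.argmax(dim=-1)                    # hard top-1 routing decision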
def get_source_weights(self, t: torch.Tensor) -> torch.Tensor:
"""
Get the learned source weights for given timesteps.
Useful for debugging and visualization.
Args:
t: Noise levels [B] in [0, 1] or [0, 999]
Returns:
weights: Source weights [B, 3] for [text, frame, latent]
"""
# Normalize timesteps
if t.max() <= 1.0:
t_scaled = t * 999.0
else:
t_scaled = t
t_scaled = t_scaled.clamp(0, 999)
time_emb = self.time_embed(t_scaled)
time_feat = self.time_mlp(time_emb)
weights = self.source_weighting(time_feat)
return weights
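# Debugging sketch: sweep timesteps to inspect how the learned weighting shifts
# between text, frame, and latent sources (`router` is a hypothetical instance):
#   ts = torch.linspace(0, 999, steps=5)
#   w = router.get_source_weights(ts)                  # [5, 3], each row sums to 1
#   for ti, wi in zip(ts.tolist(), w.tolist()):
#       print(f"t={ti:6.1f}  text={wi[0]:.2f}  frame={wi[1]:.2f}  latent={wi[2]:.2f}")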
# =============================================================================
# MODEL FACTORY FUNCTIONS
# =============================================================================
def create_expert(config, expert_id: Optional[int] = None) -> nn.Module:
"""
Factory function to create an expert model.
Args:
config: Config object
expert_id: Optional expert ID for per-expert schedule/objective configuration
"""
# Make a copy of config to avoid modifying the original
import copy
config = copy.copy(config)
config.expert_params = config.expert_params.copy()
# Inject schedule_type into expert_params if not already present
if "schedule_type" not in config.expert_params:
# Check for per-expert schedule first (with backward compatibility)
if (hasattr(config, 'expert_schedule_types') and
config.expert_schedule_types and
expert_id is not None and
expert_id in config.expert_schedule_types):
config.expert_params["schedule_type"] = config.expert_schedule_types[expert_id]
else:
# Use default schedule_type (with fallback for old configs)
config.expert_params["schedule_type"] = getattr(config, 'schedule_type', 'linear_interp')
# Inject objective_type into expert_params if not already present
if "objective_type" not in config.expert_params:
# Check for per-expert objectives (with backward compatibility)
if (hasattr(config, 'expert_objectives') and
config.expert_objectives and
expert_id is not None and
expert_id in config.expert_objectives):
config.expert_params["objective_type"] = config.expert_objectives[expert_id]
else:
# Use default objective (with fallback for old configs)
config.expert_params["objective_type"] = getattr(config, 'default_objective', 'fm')
if config.expert_architecture == "unet":
return UNetExpert(config)
elif config.expert_architecture == "simple_cnn":
return SimpleCNNExpert(config)
elif config.expert_architecture == "dit":
return DiTExpert(config)
else:
raise ValueError(f"Unknown expert architecture: {config.expert_architecture}")
def create_router(config) -> Optional[nn.Module]:
"""Factory function to create router model"""
if config.router_architecture == "none" or config.is_monolithic:
return None
elif config.router_architecture == "deterministic_timestep":
return DeterministicTimestepRouter(config)
elif config.router_architecture == "vit":
return ViTRouter(config)
elif config.router_architecture == "cnn":
return CNNRouter(config)
elif config.router_architecture == "dit":
return DiTRouter(config)
elif config.router_architecture == "adaptive_video":
return AdaptiveVideoRouter(config)
else:
raise ValueError(f"Unknown router architecture: {config.router_architecture}")