UltraThinking-LLM-Training / src /models /moe_advanced.py

Upload folder using huggingface_hub

54c5666 verified 4 months ago

30.2 kB

	"""
	Advanced Mixture of Experts (MoE³) Architecture
	Implements hierarchical, consultative experts with domain specialization
	"""

	import math
	import time
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	# Safe import for torch.compile disable
	try:
	from torch._dynamo import disable as torchdynamo_disable
	except Exception: # pragma: no cover
	def torchdynamo_disable(fn):
	return fn
	from typing import Dict, List, Tuple, Optional, Any
	from dataclasses import dataclass
	from enum import Enum
	import math
	import logging

	logger = logging.getLogger(__name__)


	class ExpertType(Enum):
	    """Enumerates the tiers of experts that make up the hierarchy."""

	    # Experts holding domain knowledge.
	    KNOWLEDGE = "knowledge"
	    # Experts dedicated to task-specific skills.
	    SKILL = "skill"
	    # Experts performing meta-reasoning.
	    META = "meta"
	    # Experts responsible for safety and alignment.
	    SAFETY = "safety"


	@dataclass
	class ExpertConfig:
	    """Hyper-parameters shared by the expert pools, routers and MoE layers."""

	    # --- Size of each expert pool -------------------------------------
	    num_knowledge_experts: int = 64
	    num_skill_experts: int = 32
	    num_meta_experts: int = 16
	    num_safety_experts: int = 8

	    # Capacity factor used for load balancing.
	    expert_capacity: float = 1.25
	    # Dropout probability applied inside experts and combination layers.
	    expert_dropout: float = 0.1

	    # --- Auxiliary loss weights ---------------------------------------
	    # Tuned for a stable ~50% maximum expert usage without destabilising
	    # training.
	    # Switch-Transformer style load balancing term.
	    load_balance_weight: float = 0.01
	    # Router logit regularisation (keeps logits away from extremes).
	    z_loss_weight: float = 0.001
	    # Routing diversity term (kept small for stability).
	    importance_weight: float = 0.005
	    # Entropy regularisation term.
	    entropy_reg_weight: float = 0.5
	    # Legacy weight, retained only for compatibility.
	    aux_loss_weight: float = 0.01

	    # --- Routing behaviour --------------------------------------------
	    # Number of experts each token is routed to (50% max expert for k=2).
	    top_k: int = 2
	    moe_dtype: torch.dtype = torch.float32
	    expert_parallelism: bool = True
	    consultative_attention: bool = True
	    hierarchical_routing: bool = True


	class NoisyTopKRouter(nn.Module):
	    """Top-k token router with a forced-uniform warmup and balancing losses.

	    Each token is assigned to ``min(top_k, num_experts)`` experts. During the
	    warmup window (counted in training-mode forward calls) routing is handed
	    over from near-uniform random scores to the zero-initialised ``gate``
	    projection.

	    Args:
	        hidden_dim: Size of the last dimension of the routed hidden states.
	        num_experts: Number of experts to choose from.
	        top_k: Experts selected per token (capped at ``num_experts``).
	        noise_std: Stored on the module; ``forward`` does not read it.
	        warmup_steps: Length of the warmup window in training forward calls.
	    """

	    def __init__(
	        self,
	        hidden_dim: int,
	        num_experts: int,
	        top_k: int = 2,
	        noise_std: float = 1.0,
	        warmup_steps: int = 100
	    ):
	        super().__init__()
	        self.hidden_dim = hidden_dim
	        self.num_experts = num_experts
	        self.top_k = top_k
	        self.noise_std = noise_std
	        self.warmup_steps = warmup_steps

	        # Number of training-mode forward calls seen so far (drives warmup).
	        self.register_buffer('_forward_count', torch.tensor(0, dtype=torch.long))

	        # Router projection, always kept in float32.
	        self.gate = nn.Linear(hidden_dim, num_experts, bias=True, dtype=torch.float32)

	        # Learnable noise projection. It is not used by ``forward`` but is
	        # kept so existing checkpoints keep loading.
	        self.noise_linear = nn.Linear(hidden_dim, num_experts, dtype=torch.float32)

	        with torch.no_grad():
	            # Zero weights and bias: every expert is equally likely before
	            # any learning has happened.
	            self.gate.weight.zero_()
	            self.gate.bias.zero_()
	            self.noise_linear.weight.zero_()

	    @staticmethod
	    def _autocast_device_type(tensor: torch.Tensor) -> str:
	        """Return the autocast device type matching where ``tensor`` lives."""
	        device_type = tensor.device.type
	        # Only 'cuda' and 'cpu' were ever used here; fall back to 'cpu' for
	        # any other backend instead of guessing its autocast support.
	        return device_type if device_type in ('cuda', 'cpu') else 'cpu'

	    @staticmethod
	    def _gumbel_like(reference: torch.Tensor) -> torch.Tensor:
	        """Sample Gumbel(0, 1) noise with the shape/dtype/device of ``reference``."""
	        uniform = torch.rand_like(reference)
	        return -torch.log(-torch.log(uniform + 1e-10) + 1e-10)

	    @torchdynamo_disable
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        training: bool = True
	    ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
	        """Route every token to its top-k experts.

	        Args:
	            hidden_states: Tensor of shape ``[batch, seq, hidden]``.
	            training: Enables the warmup counter and exploration noise. This
	                explicit flag is used instead of ``self.training``.

	        Returns:
	            ``(dispatch_mask, combine_weights, aux_losses)`` where the first
	            two have shape ``[batch, seq, num_experts, k]`` (``dispatch_mask``
	            is boolean, ``combine_weights`` has the input dtype and sums to 1
	            per token) and ``aux_losses`` is the dictionary produced by
	            ``_compute_aux_losses``.
	        """
	        batch_size, seq_len, hidden_dim = hidden_states.shape
	        num_tokens = batch_size * seq_len
	        # reshape (not view) so non-contiguous inputs are accepted as well.
	        hidden_states_flat = hidden_states.reshape(num_tokens, hidden_dim)  # [B*S, H]
	        original_dtype = hidden_states.dtype

	        if training:
	            self._forward_count += 1

	        # Gate logits; stays None while routing is forced to be uniform.
	        raw_logits = None

	        # Routing math is done in float32 with autocast switched off for the
	        # device the data actually lives on.
	        device_type = self._autocast_device_type(hidden_states)
	        with torch.amp.autocast(device_type=device_type, enabled=False):
	            hs_fp32 = hidden_states_flat.to(dtype=torch.float32)

	            # 1.0 at the start of training, 0.0 once warmup has finished.
	            warmup_factor = max(
	                0.0, 1.0 - self._forward_count.item() / max(self.warmup_steps, 1)
	            )

	            if training and warmup_factor > 0.5:
	                # First half of warmup: uniform scores plus a tiny random
	                # perturbation that only serves to break ties.
	                uniform_scores = torch.full(
	                    (num_tokens, self.num_experts),
	                    1.0 / self.num_experts,
	                    dtype=torch.float32,
	                    device=hs_fp32.device,
	                )
	                random_noise = torch.randn_like(uniform_scores) * 0.01
	                scores = F.softmax((uniform_scores + random_noise) / 0.1, dim=-1)
	            else:
	                raw_logits = self.gate(hs_fp32)  # [B*S, E]

	                if training and warmup_factor > 0:
	                    # Second half of warmup: blend learned logits in while
	                    # the temperature drops from 5 towards 1.
	                    temperature = 5.0 * warmup_factor + 1.0 * (1 - warmup_factor)
	                    logits = (
	                        raw_logits * (1 - warmup_factor) + self._gumbel_like(raw_logits)
	                    ) / temperature
	                elif training:
	                    # After warmup: learned routing with mild exploration.
	                    temperature = max(1.0, 8.0 / self.num_experts)
	                    logits = (raw_logits + self._gumbel_like(raw_logits) * 0.3) / temperature
	                else:
	                    logits = raw_logits

	                scores = F.softmax(logits, dim=-1)

	            # Probability floor that scales with the expert count
	            # (8 experts -> 5%, 64 experts -> ~0.6%), then renormalise.
	            min_prob = max(0.001, 0.4 / self.num_experts)
	            scores = scores * (1 - min_prob * self.num_experts) + min_prob
	            scores = scores / scores.sum(dim=-1, keepdim=True)

	            actual_top_k = min(self.top_k, self.num_experts)
	            top_k_scores, top_k_indices = torch.topk(scores, actual_top_k, dim=-1)

	            # Selected weights sum to 1 per token, then go back to input dtype.
	            top_k_scores = top_k_scores / top_k_scores.sum(dim=-1, keepdim=True)
	            top_k_scores = top_k_scores.to(original_dtype)

	        # Scatter along the expert axis: entry [t, e, k] is set when expert
	        # ``e`` is the k-th choice of token ``t``.
	        slot_index = top_k_indices.unsqueeze(1)  # [B*S, 1, K]

	        dispatch_mask = torch.zeros(
	            num_tokens, self.num_experts, actual_top_k,
	            dtype=torch.bool, device=hidden_states.device
	        )
	        dispatch_mask.scatter_(1, slot_index, True)

	        combine_weights = torch.zeros(
	            num_tokens, self.num_experts, actual_top_k,
	            dtype=hidden_states.dtype, device=hidden_states.device
	        )
	        combine_weights.scatter_(1, slot_index, top_k_scores.unsqueeze(1))

	        aux_losses = self._compute_aux_losses(scores, dispatch_mask, raw_logits)

	        dispatch_mask = dispatch_mask.view(
	            batch_size, seq_len, self.num_experts, actual_top_k
	        )
	        combine_weights = combine_weights.view(
	            batch_size, seq_len, self.num_experts, actual_top_k
	        )

	        return dispatch_mask, combine_weights, aux_losses

	    def _compute_aux_losses(
	        self,
	        scores: torch.Tensor,
	        dispatch_mask: torch.Tensor,
	        router_logits: Optional[torch.Tensor] = None
	    ) -> Dict[str, torch.Tensor]:
	        """Compute the load-balancing losses and usage statistics.

	        Args:
	            scores: Routing probabilities of shape ``[tokens, num_experts]``.
	            dispatch_mask: Boolean mask of shape ``[tokens, num_experts, k]``.
	            router_logits: Pre-softmax gate logits ``[tokens, num_experts]``.
	                When ``None`` (no logits exist, e.g. forced-uniform warmup)
	                the z-loss is reported as zero.

	        Returns:
	            Dictionary with the scalar losses ``load_loss``,
	            ``importance_loss``, ``z_loss`` and ``entropy_reg_loss`` plus the
	            metrics ``expert_usage`` (per-expert share of assignments, sums
	            to 1) and ``routing_entropy`` (entropy of that share).
	        """
	        device_type = self._autocast_device_type(scores)
	        with torch.amp.autocast(device_type=device_type, enabled=False):
	            scores_fp32 = scores.to(torch.float32)
	            dispatch_fp32 = dispatch_mask.float()

	            # Share of all (token, slot) assignments received by each expert.
	            expert_usage = dispatch_fp32.sum(dim=(0, 2))  # [E]
	            expert_usage_normalized = expert_usage / (expert_usage.sum() + 1e-10)

	            # Mean gate probability per expert.
	            gate_probs = scores_fp32.mean(dim=0)  # [E]

	            # Switch-Transformer style product of gate probability and routed
	            # fraction, clamped to avoid explosions.
	            load_loss = (gate_probs * expert_usage_normalized).sum() * self.num_experts
	            load_loss = load_loss.clamp(min=0.0, max=10.0)

	            # Squared coefficient of variation of the mean gate probability.
	            importance_variance = torch.var(gate_probs, unbiased=False)
	            importance_mean = torch.mean(gate_probs)
	            importance_loss = importance_variance / (importance_mean ** 2 + 1e-10)
	            importance_loss = importance_loss.clamp(min=0.0, max=1.0)

	            # Per-token routing entropy normalised by its maximum, log(E).
	            token_entropy = -torch.sum(
	                scores_fp32 * torch.log(scores_fp32 + 1e-10), dim=-1
	            ).mean()
	            normalized_entropy = token_entropy / (math.log(self.num_experts) + 1e-10)

	            # Small expert pools get a stronger penalty for low entropy.
	            entropy_strength = min(10.0, 20.0 / self.num_experts)
	            entropy_reg_loss = (1.0 - normalized_entropy) * entropy_strength
	            entropy_reg_loss = entropy_reg_loss.clamp(min=0.0, max=10.0)

	            # Z-loss penalises large router *logits*, so it must be computed
	            # on the pre-softmax values rather than on probabilities.
	            if router_logits is None:
	                z_loss = scores_fp32.new_zeros(())
	            else:
	                log_z = torch.logsumexp(router_logits.to(torch.float32), dim=-1)
	                z_loss = (log_z ** 2).mean().clamp(min=0.0, max=100.0)

	            # Entropy of the realised expert usage (higher = more balanced).
	            expert_entropy = -torch.sum(
	                expert_usage_normalized * torch.log(expert_usage_normalized + 1e-10)
	            )

	        return {
	            'load_loss': load_loss.to(scores.dtype),
	            'importance_loss': importance_loss.to(scores.dtype),
	            'z_loss': z_loss.to(scores.dtype),
	            'entropy_reg_loss': entropy_reg_loss.to(scores.dtype),
	            'expert_usage': expert_usage_normalized,
	            'routing_entropy': expert_entropy
	        }


	class Expert(nn.Module):
	    """Feed-forward expert with an optional residual specialization branch.

	    Args:
	        hidden_dim: Input and output feature size.
	        intermediate_dim: Width of the hidden MLP layer.
	        dropout: Dropout probability applied after the activation.
	        activation: ``'gelu'`` selects GELU; any other value selects ReLU.
	        specialization: Optional label; a truthy value adds a linear +
	            layer-norm branch on top of the MLP output.
	    """

	    def __init__(
	        self,
	        hidden_dim: int,
	        intermediate_dim: int,
	        dropout: float = 0.1,
	        activation: str = 'gelu',
	        specialization: Optional[str] = None
	    ):
	        super().__init__()

	        self.specialization = specialization

	        # Two-layer MLP with dropout in between.
	        self.fc1 = nn.Linear(hidden_dim, intermediate_dim)
	        self.fc2 = nn.Linear(intermediate_dim, hidden_dim)
	        self.dropout = nn.Dropout(dropout)

	        if activation == 'gelu':
	            self.activation = nn.GELU()
	        else:
	            self.activation = nn.ReLU()

	        # The extra branch only exists for labelled experts.
	        if specialization:
	            self.specialization_layer = nn.Linear(hidden_dim, hidden_dim)
	            self.specialization_norm = nn.LayerNorm(hidden_dim)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Run the MLP and, for labelled experts, add the specialization branch."""
	        mlp_out = self.fc2(self.dropout(self.activation(self.fc1(hidden_states))))

	        if not self.specialization:
	            return mlp_out

	        # Residual: MLP output plus its normalised linear projection.
	        branch = self.specialization_norm(self.specialization_layer(mlp_out))
	        return mlp_out + branch


	class CrossExpertAttention(nn.Module):
	    """Multi-head attention that lets expert outputs attend to each other.

	    Args:
	        hidden_dim: Feature size of every expert output.
	        num_heads: Number of attention heads; must divide ``hidden_dim``.
	        dropout: Dropout probability applied to the attention weights.

	    Raises:
	        ValueError: If ``num_heads`` is not positive or does not divide
	            ``hidden_dim``.
	    """

	    def __init__(
	        self,
	        hidden_dim: int,
	        num_heads: int = 8,
	        dropout: float = 0.1
	    ):
	        super().__init__()

	        # Without this check the head split in ``forward`` can silently
	        # succeed with a scrambled layout instead of failing.
	        if num_heads <= 0 or hidden_dim % num_heads != 0:
	            raise ValueError(
	                f"hidden_dim ({hidden_dim}) must be divisible by a positive "
	                f"num_heads ({num_heads})"
	            )

	        self.hidden_dim = hidden_dim
	        self.num_heads = num_heads
	        self.head_dim = hidden_dim // num_heads

	        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
	        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
	        self.v_proj = nn.Linear(hidden_dim, hidden_dim)
	        self.o_proj = nn.Linear(hidden_dim, hidden_dim)

	        self.dropout = nn.Dropout(dropout)

	    def _split_heads(self, projected: torch.Tensor, batch_size: int) -> torch.Tensor:
	        """Reshape ``[batch, tokens, hidden]`` to ``[batch, heads, tokens, head_dim]``."""
	        return projected.view(
	            batch_size, -1, self.num_heads, self.head_dim
	        ).transpose(1, 2)

	    def forward(
	        self,
	        expert_outputs: List[torch.Tensor],
	        weights: Optional[torch.Tensor] = None
	    ) -> torch.Tensor:
	        """Attend across all expert outputs and average the result.

	        Args:
	            expert_outputs: Non-empty list of equally shaped
	                ``[batch, seq, hidden]`` tensors, one per expert.
	            weights: Optional multiplicative factor applied to the raw
	                attention scores; after ``unsqueeze(1)`` it must be
	                expandable to ``[batch, heads, E*seq, E*seq]``.

	        Returns:
	            Tensor of shape ``[batch, seq, hidden]``: the attended outputs
	            averaged over the expert axis.

	        Raises:
	            ValueError: If ``expert_outputs`` is empty.
	        """
	        # An empty list has no element to take a shape from, so there is no
	        # meaningful tensor to return.
	        if not expert_outputs:
	            raise ValueError("expert_outputs must contain at least one tensor")

	        # [num_experts, batch, seq, hidden]
	        stacked = torch.stack(expert_outputs, dim=0)
	        num_experts, batch_size, seq_len, hidden_dim = stacked.shape

	        # Flatten experts and positions into one token axis:
	        # [batch, num_experts * seq, hidden]
	        tokens = stacked.permute(1, 0, 2, 3).reshape(
	            batch_size, num_experts * seq_len, hidden_dim
	        )

	        Q = self._split_heads(self.q_proj(tokens), batch_size)
	        K = self._split_heads(self.k_proj(tokens), batch_size)
	        V = self._split_heads(self.v_proj(tokens), batch_size)

	        # Scaled dot-product scores.
	        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

	        if weights is not None:
	            # Broadcast the weights over the head axis.
	            scores = scores * weights.unsqueeze(1).expand_as(scores)

	        attn_weights = self.dropout(F.softmax(scores, dim=-1))

	        attn_output = torch.matmul(attn_weights, V)
	        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, hidden_dim)

	        # Project, restore the expert axis and average over it.
	        output = self.o_proj(attn_output)
	        output = output.view(batch_size, num_experts, seq_len, hidden_dim)
	        return output.mean(dim=1)


	class HierarchicalMoE(nn.Module):
	    """Four-level mixture of experts: knowledge -> skill -> meta -> safety.

	    Every level has its own expert pool and router. Later levels receive the
	    input plus a scaled sum of the earlier levels' outputs. The four level
	    outputs are merged either by a learned combiner
	    (``config.hierarchical_routing``) or by fixed weights, and the input is
	    added back as a residual.

	    Args:
	        config: Pool sizes, dropout, top-k and feature switches.
	        hidden_dim: Feature size of the hidden states.
	        intermediate_dim: Hidden width of every expert MLP.
	    """

	    # Order in which the levels are reported in the utilization metrics.
	    _LEVELS = ('knowledge', 'skill', 'meta', 'safety')

	    def __init__(self, config: ExpertConfig, hidden_dim: int, intermediate_dim: int):
	        super().__init__()

	        self.config = config
	        self.hidden_dim = hidden_dim

	        def build_pool(count, label):
	            # One Expert per slot; ``label`` maps the slot index to its
	            # specialization string.
	            return nn.ModuleList([
	                Expert(
	                    hidden_dim, intermediate_dim,
	                    dropout=config.expert_dropout,
	                    specialization=label(i)
	                )
	                for i in range(count)
	            ])

	        # Attribute names and creation order are kept stable so parameter
	        # initialisation and checkpoints stay compatible.
	        self.knowledge_experts = build_pool(
	            config.num_knowledge_experts, lambda i: f"knowledge_{i}"
	        )
	        self.skill_experts = build_pool(
	            config.num_skill_experts, lambda i: f"skill_{i}"
	        )
	        self.meta_experts = build_pool(
	            config.num_meta_experts, lambda i: f"meta_{i}"
	        )
	        # Safety experts all share the same specialization label.
	        self.safety_experts = build_pool(
	            config.num_safety_experts, lambda i: "safety"
	        )

	        # One router per level, each with a 100-step warmup.
	        self.knowledge_router = NoisyTopKRouter(
	            hidden_dim, config.num_knowledge_experts, config.top_k, warmup_steps=100
	        )
	        self.skill_router = NoisyTopKRouter(
	            hidden_dim, config.num_skill_experts, config.top_k, warmup_steps=100
	        )
	        self.meta_router = NoisyTopKRouter(
	            hidden_dim, config.num_meta_experts, config.top_k, warmup_steps=100
	        )
	        self.safety_router = NoisyTopKRouter(
	            hidden_dim,
	            config.num_safety_experts,
	            min(config.top_k, config.num_safety_experts),
	            warmup_steps=100
	        )

	        # Cross-expert attention for consultation between levels.
	        if config.consultative_attention:
	            self.cross_expert_attention = CrossExpertAttention(hidden_dim)

	        # Learned merge of the four concatenated level outputs.
	        if config.hierarchical_routing:
	            self.hierarchy_combiner = nn.Sequential(
	                nn.Linear(hidden_dim * 4, hidden_dim * 2),
	                nn.ReLU(),
	                nn.Dropout(config.expert_dropout),
	                nn.Linear(hidden_dim * 2, hidden_dim)
	            )

	    def _route_and_compute(
	        self,
	        hidden_states: torch.Tensor,
	        experts: nn.ModuleList,
	        router: NoisyTopKRouter,
	        expert_type: str
	    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
	        """Route tokens through one expert pool.

	        Args:
	            hidden_states: ``[batch, seq, hidden]`` input of this level.
	            experts: Expert pool of this level.
	            router: Router producing the dispatch mask and combine weights.
	            expert_type: Prefix for the returned auxiliary-loss keys.

	        Returns:
	            The weighted sum of expert outputs (same shape as the input) and
	            the router's auxiliary losses keyed as ``"{expert_type}_{name}"``.
	        """
	        dispatch_mask, combine_weights, aux_losses = router(hidden_states, self.training)

	        expert_output = torch.zeros_like(hidden_states)

	        for expert_idx, expert in enumerate(experts):
	            # Tokens that picked this expert in any of their k slots. [B, S]
	            expert_mask = dispatch_mask[:, :, expert_idx, :].any(dim=-1)

	            if not expert_mask.any():
	                continue

	            expert_input = hidden_states[expert_mask]
	            # Temporary leading axis so the expert sees a 3-D input.
	            expert_result = expert(expert_input.unsqueeze(0)).squeeze(0)

	            # Combine weight of this expert, summed over the k slots.
	            expert_weights = combine_weights[:, :, expert_idx, :].sum(dim=-1)
	            expert_weights = expert_weights[expert_mask]

	            expert_output[expert_mask] += expert_result * expert_weights.unsqueeze(-1)

	        prefixed_losses = {f"{expert_type}_{k}": v for k, v in aux_losses.items()}

	        return expert_output, prefixed_losses

	    @classmethod
	    def _summarize_utilization(cls, aux_losses: Dict[str, Any]) -> Dict[str, Any]:
	        """Turn the routers' usage statistics into plain-Python metrics."""
	        metrics: Dict[str, Any] = {}
	        level_entropies = []

	        for level in cls._LEVELS:
	            usage_key = f"{level}_expert_usage"
	            if usage_key not in aux_losses:
	                continue

	            usage = aux_losses[usage_key]
	            entropy = float(aux_losses[f"{level}_routing_entropy"])

	            # Per-expert share of assignments, in percent.
	            metrics[f"{level}_usage_pct"] = (usage * 100).tolist()

	            # Variance of the shares (lower = more balanced).
	            if usage.numel() > 1:
	                metrics[f"{level}_load_variance"] = float(torch.var(usage, unbiased=False))
	            else:
	                metrics[f"{level}_load_variance"] = 0.0

	            # Share of the most used expert, in percent.
	            metrics[f"{level}_top_expert_pct"] = float(torch.max(usage) * 100)

	            # Entropy of the usage distribution (higher = more balanced).
	            metrics[f"{level}_entropy"] = entropy
	            level_entropies.append(entropy)

	        total_entropy = float(sum(level_entropies))
	        metrics['total_routing_entropy'] = total_entropy
	        # Average over the levels that actually reported statistics.
	        metrics['avg_routing_entropy'] = (
	            total_entropy / len(level_entropies) if level_entropies else 0.0
	        )
	        return metrics

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        return_all_levels: bool = False
	    ) -> Tuple[torch.Tensor, Dict[str, Any]]:
	        """Run all four expert levels and merge their outputs.

	        Args:
	            hidden_states: ``[batch, seq, hidden]`` input.
	            return_all_levels: When true, ``info['level_outputs']`` holds
	                every level output (plus the consulted output if
	                consultative attention is enabled); otherwise it is ``None``.

	        Returns:
	            ``(final_output, info)`` where ``final_output`` has the input
	            shape and ``info`` contains ``aux_losses``, ``level_outputs``,
	            ``num_experts_used`` and ``expert_utilization``.
	        """
	        all_aux_losses = {}
	        level_outputs = {}

	        # Level 1: knowledge experts see the raw input.
	        knowledge_output, knowledge_losses = self._route_and_compute(
	            hidden_states, self.knowledge_experts, self.knowledge_router, "knowledge"
	        )
	        level_outputs['knowledge'] = knowledge_output
	        all_aux_losses.update(knowledge_losses)

	        # Level 2: skill experts also see the knowledge output.
	        skill_input = hidden_states + 0.5 * knowledge_output
	        skill_output, skill_losses = self._route_and_compute(
	            skill_input, self.skill_experts, self.skill_router, "skill"
	        )
	        level_outputs['skill'] = skill_output
	        all_aux_losses.update(skill_losses)

	        # Level 3: meta experts see knowledge and skill outputs.
	        meta_input = hidden_states + 0.3 * knowledge_output + 0.3 * skill_output
	        meta_output, meta_losses = self._route_and_compute(
	            meta_input, self.meta_experts, self.meta_router, "meta"
	        )
	        level_outputs['meta'] = meta_output
	        all_aux_losses.update(meta_losses)

	        # Level 4: safety experts see all previous levels.
	        safety_input = hidden_states + 0.2 * (knowledge_output + skill_output + meta_output)
	        safety_output, safety_losses = self._route_and_compute(
	            safety_input, self.safety_experts, self.safety_router, "safety"
	        )
	        level_outputs['safety'] = safety_output
	        all_aux_losses.update(safety_losses)

	        # The consulted output is only exposed through ``level_outputs``, so
	        # the attention pass is skipped when the caller will not receive it.
	        if (
	            return_all_levels
	            and self.config.consultative_attention
	            and hasattr(self, 'cross_expert_attention')
	        ):
	            level_outputs['consulted'] = self.cross_expert_attention(
	                [knowledge_output, skill_output, meta_output, safety_output]
	            )

	        if self.config.hierarchical_routing:
	            # Learned merge of the concatenated level outputs.
	            combined = torch.cat([
	                knowledge_output,
	                skill_output,
	                meta_output,
	                safety_output
	            ], dim=-1)
	            final_output = self.hierarchy_combiner(combined)
	        else:
	            # Fixed-weight merge.
	            final_output = (
	                0.3 * knowledge_output +
	                0.3 * skill_output +
	                0.2 * meta_output +
	                0.2 * safety_output
	            )

	        # Residual connection around the whole hierarchy.
	        final_output = hidden_states + final_output

	        info = {
	            'aux_losses': all_aux_losses,
	            'level_outputs': level_outputs if return_all_levels else None,
	            'num_experts_used': {
	                'knowledge': self.config.num_knowledge_experts,
	                'skill': self.config.num_skill_experts,
	                'meta': self.config.num_meta_experts,
	                'safety': self.config.num_safety_experts
	            },
	            'expert_utilization': self._summarize_utilization(all_aux_losses)
	        }

	        return final_output, info


	class MoELayer(nn.Module):
	    """Pre-norm -> hierarchical MoE -> dropout -> post-norm block.

	    Args:
	        config: Expert configuration (pool sizes, dropout, loss weights).
	        hidden_dim: Feature size of the hidden states.
	        intermediate_dim: Hidden width of the expert MLPs; a falsy value
	            selects ``hidden_dim * 4``.
	    """

	    def __init__(
	        self,
	        config: ExpertConfig,
	        hidden_dim: int,
	        intermediate_dim: Optional[int] = None
	    ):
	        super().__init__()

	        self.config = config
	        self.hidden_dim = hidden_dim
	        self.intermediate_dim = intermediate_dim or hidden_dim * 4

	        # Layer normalization before and after the MoE.
	        self.input_norm = nn.LayerNorm(hidden_dim)
	        self.post_moe_norm = nn.LayerNorm(hidden_dim)

	        # Hierarchical MoE
	        self.moe = HierarchicalMoE(config, hidden_dim, self.intermediate_dim)

	        # Dropout
	        self.dropout = nn.Dropout(config.expert_dropout)

	    def _weighted_aux_loss(
	        self,
	        aux_losses: Dict[str, torch.Tensor],
	        device: torch.device
	    ) -> torch.Tensor:
	        """Sum the routers' auxiliary losses with their configured weights.

	        Entries whose key matches none of the known loss names (usage and
	        entropy metrics) are ignored. The fallback weights mirror the
	        ``ExpertConfig`` defaults and only apply if ``config`` lacks the
	        attribute.
	        """
	        # Checked in this order; the first matching name wins.
	        loss_weights = (
	            # Load balancing: encourages uniform expert usage.
	            ('load_loss', getattr(self.config, 'load_balance_weight', 0.01)),
	            # Z-loss: keeps router logits from growing too large.
	            ('z_loss', getattr(self.config, 'z_loss_weight', 0.001)),
	            # Importance: encourages diverse routing decisions.
	            ('importance_loss', getattr(self.config, 'importance_weight', 0.005)),
	            # Direct entropy regularisation.
	            ('entropy_reg_loss', getattr(self.config, 'entropy_reg_weight', 0.5)),
	        )

	        total = torch.tensor(0.0, device=device)
	        for key, loss in aux_losses.items():
	            for name, weight in loss_weights:
	                if name in key:
	                    total = total + weight * loss
	                    break
	        return total

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        return_aux_loss: bool = True
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Apply the MoE block.

	        Args:
	            hidden_states: Input hidden states.
	            return_aux_loss: When true (and the MoE reported any auxiliary
	                losses) the weighted auxiliary loss is returned as well.

	        Returns:
	            ``(output, aux_loss)``; ``aux_loss`` is ``None`` when it was not
	            requested or no auxiliary losses were reported. The MoE info
	            dictionary of the call is kept on ``self.last_moe_info``.
	        """
	        normed_hidden_states = self.input_norm(hidden_states)

	        moe_output, moe_info = self.moe(normed_hidden_states)

	        # Kept for later inspection by callers.
	        self.last_moe_info = moe_info

	        output = self.post_moe_norm(self.dropout(moe_output))

	        aux_loss = None
	        if return_aux_loss and moe_info.get('aux_losses'):
	            aux_loss = self._weighted_aux_loss(
	                moe_info['aux_losses'], hidden_states.device
	            )

	        return output, aux_loss


	def create_moe_layers(
	    num_layers: int,
	    config: ExpertConfig,
	    hidden_dim: int,
	    sparse_layers: Optional[List[int]] = None
	) -> nn.ModuleList:
	    """Build the per-layer feed-forward stack of a transformer.

	    Args:
	        num_layers: Total number of layers to create.
	        config: Expert configuration passed to every MoE layer; its
	            ``expert_dropout`` is also used by the dense layers.
	        hidden_dim: Feature size of the hidden states.
	        sparse_layers: Indices of the layers that become ``MoELayer``;
	            defaults to every odd index (1, 3, 5, ...).

	    Returns:
	        ``nn.ModuleList`` with ``num_layers`` entries.
	    """
	    moe_positions = sparse_layers
	    if moe_positions is None:
	        # Default: every other layer is MoE.
	        moe_positions = list(range(1, num_layers, 2))

	    stack = nn.ModuleList()

	    for position in range(num_layers):
	        if position not in moe_positions:
	            # Dense placeholder FFN (a real model would plug in its own).
	            stack.append(
	                nn.Sequential(
	                    nn.LayerNorm(hidden_dim),
	                    nn.Linear(hidden_dim, hidden_dim * 4),
	                    nn.GELU(),
	                    nn.Linear(hidden_dim * 4, hidden_dim),
	                    nn.Dropout(config.expert_dropout)
	                )
	            )
	            continue

	        stack.append(MoELayer(config, hidden_dim))

	    return stack