arnomatic commited on
Commit
7e2e7b9
·
verified ·
1 Parent(s): db1a28e

Upload 3 files

Browse files
Files changed (3) hide show
  1. moe_config.py +119 -0
  2. moe_layers.py +323 -0
  3. moe_model.py +460 -0
moe_config.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace-compatible MoE Configuration
3
+ Basierend auf dem nanoMoE Blog Post
4
+ """
5
+
6
+ from transformers import PretrainedConfig
7
+
8
+
9
class MoEGPTConfig(PretrainedConfig):
    """
    Configuration for the MoE-based GPT model.

    Args:
        vocab_size (int): Vocabulary size.
        n_positions (int): Maximum sequence length.
        n_embd (int): Embedding dimensionality (d in the blog post).
        n_layer (int): Number of transformer blocks.
        n_head (int): Number of attention heads.
        n_experts (int): Number of experts per MoE layer.
        n_experts_active (int): Number of active experts (top-k).
        moe_layer_frequency (int): Every n-th layer becomes an MoE layer (P in the blog post).
        capacity_factor (float): Expert capacity factor during training.
        eval_capacity_factor (float): Expert capacity factor during evaluation.
        use_noisy_gating (bool): Whether to use noisy top-k gating.
        aux_loss_alpha (float): Scale of the load-balancing loss.
        router_z_loss_alpha (float): Scale of the router z-loss.
        bias (bool): Whether linear layers use a bias term.
        dropout (float): Dropout probability.
        activation_function (str): Activation function (gelu, relu, swiglu).
        initializer_range (float): Std used for weight initialization.
        layer_norm_epsilon (float): Epsilon for layer normalization.
        use_cache (bool): Stored for HuggingFace generation compatibility.
        rope_theta (float): Base theta for the RoPE frequency table.

    Raises:
        ValueError: If the head / expert / layer settings are inconsistent.
    """

    model_type = "moe_gpt"

    def __init__(
        self,
        vocab_size=128256,  # Llama 3.2 tokenizer (incl. special tokens)
        n_positions=2048,  # default 2048 for RoPE
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_experts=8,
        n_experts_active=2,
        moe_layer_frequency=2,
        capacity_factor=1.25,
        eval_capacity_factor=2.0,
        use_noisy_gating=True,
        aux_loss_alpha=0.01,
        router_z_loss_alpha=0.001,
        bias=False,
        dropout=0.1,
        activation_function="gelu",
        initializer_range=0.1,
        layer_norm_epsilon=1e-5,
        use_cache=True,
        rope_theta=10000.0,  # RoPE base theta
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Validate up front, and raise ValueError instead of `assert` so the
        # checks are not silently stripped when Python runs with -O.
        if n_embd % n_head != 0:
            raise ValueError("n_embd muss durch n_head teilbar sein")
        if n_experts_active > n_experts:
            raise ValueError("n_experts_active darf nicht größer als n_experts sein")
        if moe_layer_frequency < 1:
            raise ValueError("moe_layer_frequency muss mindestens 1 sein")

        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_experts = n_experts
        self.n_experts_active = n_experts_active
        self.moe_layer_frequency = moe_layer_frequency
        self.capacity_factor = capacity_factor
        self.eval_capacity_factor = eval_capacity_factor
        self.use_noisy_gating = use_noisy_gating
        self.aux_loss_alpha = aux_loss_alpha
        self.router_z_loss_alpha = router_z_loss_alpha
        self.bias = bias
        self.dropout = dropout
        self.activation_function = activation_function
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        # Standard HuggingFace attribute aliases (needed by .generate())
        self.num_hidden_layers = n_layer
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.max_position_embeddings = n_positions

    @property
    def _num_moe_layers(self):
        """Number of MoE layers: layer 0, then every `moe_layer_frequency`-th layer."""
        return sum(1 for i in range(self.n_layer) if i % self.moe_layer_frequency == 0)

    @property
    def head_dim(self):
        """Dimension per attention head."""
        return self.n_embd // self.n_head

    @property
    def total_experts(self):
        """Total number of experts across the whole model."""
        return self._num_moe_layers * self.n_experts

    @property
    def active_parameters_ratio(self):
        """Approximate ratio of active to total parameters (FFN-only estimate)."""
        num_moe_layers = self._num_moe_layers
        num_dense_layers = self.n_layer - num_moe_layers

        # Simplified estimate that ignores attention parameters:
        # each FFN has ~8*d^2 parameters (two d x 4d projections).
        dense_params = num_dense_layers * (8 * self.n_embd**2)
        moe_total_params = num_moe_layers * self.n_experts * (8 * self.n_embd**2)
        moe_active_params = num_moe_layers * self.n_experts_active * (8 * self.n_embd**2)

        total = dense_params + moe_total_params
        active = dense_params + moe_active_params

        return active / total if total > 0 else 1.0
moe_layers.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MoE Layer Komponenten
3
+ Basierend auf dem nanoMoE Blog Post und HuggingFace Best Practices
4
+ """
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from typing import Tuple, Optional
11
+
12
+
13
class MoERouter(nn.Module):
    """
    Noisy top-k router for MoE.

    Routes each token to its top-k experts based on learned gating
    probabilities and packs the selected tokens into fixed-capacity
    per-expert batches.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int,
        n_experts_active: int,
        use_noisy_gating: bool = True,
        capacity_factor: float = 1.25,
    ):
        super().__init__()

        self.d_model = d_model
        self.n_experts = n_experts
        self.n_experts_active = n_experts_active
        self.use_noisy_gating = use_noisy_gating
        self.capacity_factor = capacity_factor

        # Linear projections for the router (no bias, see Shazeer et al. 2017)
        self.w_gate = nn.Linear(d_model, n_experts, bias=False)
        self.w_noise = nn.Linear(d_model, n_experts, bias=False) if use_noisy_gating else None

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Input tensor [batch_size, seq_len, d_model]

        Returns:
            expert_weights: combine weights per expert slot [num_tokens, n_experts, capacity]
            expert_mask: boolean mask of occupied slots [num_tokens, n_experts, capacity]
            expert_batches: packed token batches per expert [n_experts, capacity, d_model]
            router_logits: router logits for the z-loss [batch_size, seq_len, n_experts]
        """
        batch_size, seq_len, d_model = x.shape
        num_tokens = batch_size * seq_len

        # The router ALWAYS runs in FP32 for numerical stability!
        device_type = "cuda" if x.is_cuda else "cpu"
        with torch.amp.autocast(device_type=device_type, enabled=False):
            x_fp32 = x.float()

            # Compute router logits
            router_logits = self.w_gate(x_fp32)  # [B, T, n_experts]

            # Noisy top-k gating (optional; training only, so eval is deterministic)
            if self.use_noisy_gating and self.training:
                noise = F.softplus(self.w_noise(x_fp32))
                noise = noise * torch.randn_like(noise)
                router_logits = router_logits + noise

            # Select the top-k experts per token
            top_k_logits, top_k_indices = router_logits.topk(
                self.n_experts_active, dim=-1
            )  # [B, T, K]

            # Softmax over all experts, with non-top-k entries masked to -inf
            router_probs = torch.full_like(router_logits, float("-inf"))
            router_probs.scatter_(-1, top_k_indices, top_k_logits)
            router_probs = F.softmax(router_probs, dim=-1)  # [B, T, n_experts]

            # Compute the per-expert token capacity
            capacity = self._compute_capacity(num_tokens)

            # Multi-hot mask of the chosen experts
            expert_mask = F.one_hot(
                top_k_indices, num_classes=self.n_experts
            )  # [B, T, K, n_experts]
            expert_mask = expert_mask.view(num_tokens, self.n_experts_active, self.n_experts)
            expert_mask = expert_mask.permute(1, 0, 2)  # [K, num_tokens, n_experts]

            # Position of each token within its expert's batch
            # (cumsum over the flattened [K * num_tokens] axis gives top-1
            # assignments priority over lower-ranked choices)
            expert_rank = expert_mask.reshape(
                self.n_experts_active * num_tokens, self.n_experts
            )
            expert_rank = torch.cumsum(expert_rank, dim=0) - 1
            expert_rank = expert_rank.reshape(
                self.n_experts_active, num_tokens, self.n_experts
            )

            # Drop tokens that exceed the expert capacity
            expert_mask = expert_mask * torch.lt(expert_rank, capacity)

            # Slot index within the expert batch
            expert_rank = torch.sum(expert_mask * expert_rank, dim=-1)  # [K, num_tokens]

            # Multiply probabilities with the (capacity-limited) mask
            router_probs = router_probs.view(num_tokens, self.n_experts)[
                None, :
            ]  # [1, num_tokens, n_experts]
            expert_weights = expert_mask * router_probs  # [K, num_tokens, n_experts]

            # One-hot encoding of the slot index within each expert batch
            expert_rank_one_hot = F.one_hot(
                expert_rank, num_classes=capacity
            )  # [K, num_tokens, capacity]

            # Scatter the combine weights to their expert-batch slots
            expert_weights = torch.sum(
                expert_weights.unsqueeze(3) * expert_rank_one_hot.unsqueeze(2), dim=0
            )  # [num_tokens, n_experts, capacity]
            expert_mask = expert_weights.bool()

            # Build the packed per-expert token batches (matmul gathers tokens)
            x_flat = x.view(num_tokens, d_model)
            expert_batches = (
                expert_mask.permute(1, 2, 0).type_as(x) @ x_flat
            )  # [n_experts, capacity, d_model]

        return expert_weights, expert_mask, expert_batches, router_logits

    def _compute_capacity(self, num_tokens: int) -> int:
        """Compute the per-expert token capacity for this batch."""
        capacity = math.floor(
            self.n_experts_active * self.capacity_factor * num_tokens / self.n_experts
        )
        capacity += capacity % 2  # round up to an even number for better hardware utilization
        return max(int(capacity), 2)  # minimum of 2 so very small batches still route
136
+
137
+
138
class ExpertMLP(nn.Module):
    """
    A batch of MLP experts.

    All experts share the same architecture but have independent weights,
    stored stacked along a leading expert dimension so the forward pass is a
    single batched matmul (``torch.bmm``) per projection.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int,
        bias: bool = False,
        dropout: float = 0.1,
        activation: str = "gelu",
    ):
        super().__init__()

        self.d_model = d_model
        self.n_experts = n_experts
        self.bias = bias

        # 4x hidden dimension (GPT standard)
        hidden_dim = 4 * d_model

        # Stacked weights for all experts (consumed via batched matmul)
        self.w_fc = nn.Parameter(torch.empty(n_experts, d_model, hidden_dim))
        self.w_proj = nn.Parameter(torch.empty(n_experts, hidden_dim, d_model))

        if bias:
            self.fc_bias = nn.Parameter(torch.empty(n_experts, 1, hidden_dim))
            self.proj_bias = nn.Parameter(torch.empty(n_experts, 1, d_model))
        else:
            self.register_parameter("fc_bias", None)
            self.register_parameter("proj_bias", None)

        # Activation function
        if activation == "gelu":
            self.activation = nn.GELU()
        elif activation == "relu":
            self.activation = nn.ReLU()
        elif activation == "swiglu":
            # SwiGLU needs an extra gate projection
            self.w_gate = nn.Parameter(torch.empty(n_experts, d_model, hidden_dim))
            self.activation = nn.SiLU()
        else:
            raise ValueError(f"Unbekannte Aktivierung: {activation}")

        self.dropout = nn.Dropout(dropout)
        self.activation_type = activation

        # BUG FIX: all parameters above are allocated with torch.empty() and
        # were never initialized anywhere — PreTrainedModel._init_weights is
        # applied per nn.Module and never visits raw nn.Parameter tensors, so
        # the experts previously ran on uninitialized memory.
        self._reset_parameters()

    def _reset_parameters(self) -> None:
        """Fan-in-scaled truncated-normal init for weights, zeros for biases."""
        for name, param in self.named_parameters(recurse=False):
            if "bias" in name:
                nn.init.zeros_(param)
            else:
                # For x @ W the fan-in is the second-to-last dim of W.
                std = param.shape[-2] ** -0.5
                nn.init.trunc_normal_(param, mean=0.0, std=std, a=-2 * std, b=2 * std)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [n_experts, capacity, d_model]

        Returns:
            output: [n_experts, capacity, d_model]
        """
        # First projection, all experts at once via batched matmul
        h = torch.bmm(x, self.w_fc)
        if self.bias:
            h = h + self.fc_bias

        # Activation
        if self.activation_type == "swiglu":
            # SwiGLU: silu(x @ W_gate) * (x @ W_fc)
            gate = torch.bmm(x, self.w_gate)
            h = self.activation(gate) * h
        else:
            h = self.activation(h)

        # Second projection
        output = torch.bmm(h, self.w_proj)
        if self.bias:
            output = output + self.proj_bias

        return self.dropout(output)
216
+
217
+
218
class MoELayer(nn.Module):
    """
    Complete Mixture-of-Experts layer.

    Combines the router with the batched experts and returns the auxiliary
    losses needed for stable MoE training.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int = 8,
        n_experts_active: int = 2,
        use_noisy_gating: bool = True,
        capacity_factor: float = 1.25,
        bias: bool = False,
        dropout: float = 0.1,
        activation: str = "gelu",
    ):
        super().__init__()

        self.router = MoERouter(
            d_model=d_model,
            n_experts=n_experts,
            n_experts_active=n_experts_active,
            use_noisy_gating=use_noisy_gating,
            capacity_factor=capacity_factor,
        )

        self.experts = ExpertMLP(
            d_model=d_model,
            n_experts=n_experts,
            bias=bias,
            dropout=dropout,
            activation=activation,
        )

        self.n_experts = n_experts
        self.n_experts_active = n_experts_active

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [batch_size, seq_len, d_model]

        Returns:
            output: [batch_size, seq_len, d_model]
            load_balance_loss: scalar load-balancing loss
            router_z_loss: scalar router z-loss
        """
        batch_size, seq_len, d_model = x.shape
        num_tokens = batch_size * seq_len

        # Routing
        expert_weights, expert_mask, expert_batches, router_logits = self.router(x)

        # Expert forward pass
        expert_outputs = self.experts(expert_batches)  # [n_experts, capacity, d_model]

        # Combine expert outputs: a single matmul scatters each slot's output
        # back to its token, weighted by the router probability
        expert_weights_flat = expert_weights.view(num_tokens, -1)  # [num_tokens, n_experts * capacity]
        expert_outputs_flat = expert_outputs.view(-1, d_model)  # [n_experts * capacity, d_model]
        output = expert_weights_flat @ expert_outputs_flat  # [num_tokens, d_model]
        output = output.view(batch_size, seq_len, d_model)

        # Compute the auxiliary losses
        load_balance_loss = self._compute_load_balance_loss(router_logits, expert_mask)
        router_z_loss = self._compute_router_z_loss(router_logits)

        return output, load_balance_loss, router_z_loss

    def _compute_load_balance_loss(
        self, router_logits: torch.Tensor, expert_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Load-balancing loss (Switch Transformer, Fedus et al. 2022).
        Encourages a uniform distribution of tokens across experts.
        """
        batch_size, seq_len, n_experts = router_logits.shape
        num_tokens = batch_size * seq_len

        # Mean router probability per expert
        router_probs = F.softmax(router_logits, dim=-1)  # [B, T, n_experts]
        prob_per_expert = torch.mean(router_probs, dim=(0, 1))  # [n_experts]

        # Fraction of tokens actually routed to each expert
        # (no grad: the hard assignment is not differentiable)
        with torch.no_grad():
            # expert_mask is [num_tokens, n_experts, capacity]
            tokens_per_expert = torch.sum(expert_mask.float(), dim=(0, 2))  # [n_experts]
            tokens_per_expert = tokens_per_expert / (num_tokens * self.n_experts_active)

        # Dot product (scaled by n_experts so a uniform distribution gives ~1)
        loss = self.n_experts * torch.sum(prob_per_expert * tokens_per_expert)

        return loss

    def _compute_router_z_loss(self, router_logits: torch.Tensor) -> torch.Tensor:
        """
        Router z-loss (ST-MoE, Zoph et al. 2022).
        Penalizes large router logits for numerical stability.
        """
        # Squared logsumexp over the expert dimension
        z_loss = torch.logsumexp(router_logits, dim=-1) ** 2.0  # [B, T]
        z_loss = torch.mean(z_loss)

        return z_loss
moe_model.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MoE GPT Model - HuggingFace kompatibel
3
+ Basiert auf nanoMoE und dem Blog Post
4
+ """
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from typing import Optional, Tuple, Union
11
+ from dataclasses import dataclass
12
+
13
+ from transformers import PreTrainedModel
14
+ from transformers.generation import GenerationMixin
15
+ from transformers.modeling_outputs import CausalLMOutputWithPast
16
+
17
+ from moe_config import MoEGPTConfig
18
+ from moe_layers import MoELayer
19
+
20
+
21
@dataclass
class MoECausalLMOutput(CausalLMOutputWithPast):
    """
    Causal-LM output extended with the MoE-specific auxiliary losses.
    """

    # Load-balancing loss summed over all MoE layers (set during training only).
    aux_loss: Optional[torch.FloatTensor] = None
    # Router z-loss summed over all MoE layers (set during training only).
    router_z_loss: Optional[torch.FloatTensor] = None
29
+
30
+
31
def apply_rotary_emb(x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor) -> torch.Tensor:
    """
    Apply rotary position embeddings (RoPE) to *x*.

    Args:
        x: Input tensor of shape [B, H, T, D].
        freqs_cos: Cosine table of shape [T, D//2].
        freqs_sin: Sine table of shape [T, D//2].

    Returns:
        Tensor of the same shape and dtype as *x* with RoPE applied.
    """
    # Pair up adjacent feature dims: [..., D] -> [..., D//2, 2], then split
    # each pair into the real and imaginary half of a complex number.
    pairs = x.float().reshape(*x.shape[:-1], -1, 2)
    real, imag = pairs.unbind(dim=-1)

    # Complex rotation: (a + bi) * (cos + i*sin)
    rotated_real = real * freqs_cos - imag * freqs_sin
    rotated_imag = real * freqs_sin + imag * freqs_cos

    # Interleave the halves back into the original layout and restore dtype.
    out = torch.stack((rotated_real, rotated_imag), dim=-1).flatten(-2)
    return out.type_as(x)
56
+
57
+
58
def precompute_freqs_rope(dim: int, max_seq_len: int, theta: float = 10000.0) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Precompute the RoPE cosine/sine tables.

    Args:
        dim: Head dimension.
        max_seq_len: Maximum sequence length.
        theta: RoPE base used for the geometric frequency spacing.

    Returns:
        Tuple ``(freqs_cos, freqs_sin)``, each of shape [max_seq_len, dim//2].
    """
    # One inverse frequency per dimension pair, geometrically spaced by theta.
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))

    # Angle table: outer product of positions and inverse frequencies.
    positions = torch.arange(max_seq_len, dtype=torch.float32)
    angles = torch.outer(positions, inv_freq)  # [max_seq_len, dim//2]

    return torch.cos(angles), torch.sin(angles)
84
+
85
+
86
class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention with rotary position embeddings (RoPE).

    Q/K/V come from one fused projection; the attention itself is delegated to
    PyTorch's scaled_dot_product_attention (SDPA), which handles the causal
    mask and attention dropout internally.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        # Fused projection producing query, key and value for all heads at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Regularization.
        # NOTE(review): attn_dropout is never used in forward() — SDPA receives
        # dropout_p directly; kept so the module layout stays unchanged.
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.head_dim = config.n_embd // config.n_head

        # Precompute the RoPE cos/sin tables once; non-persistent buffers move
        # with .to(device) but are excluded from the state dict.
        freqs_cos, freqs_sin = precompute_freqs_rope(
            dim=self.head_dim,
            max_seq_len=config.n_positions,
            theta=config.rope_theta,
        )
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, embd = x.size()  # batch, sequence length, embedding dim

        # Fused QKV projection, then split and reshape to [B, H, T, head_dim]
        query, key, value = self.c_attn(x).split(self.n_embd, dim=2)
        query = query.view(bsz, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        key = key.view(bsz, seq_len, self.n_head, self.head_dim).transpose(1, 2)
        value = value.view(bsz, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        # Rotate queries and keys with the precomputed RoPE tables
        cos, sin = self.freqs_cos[:seq_len], self.freqs_sin[:seq_len]
        query = apply_rotary_emb(query, cos, sin)
        key = apply_rotary_emb(key, cos, sin)

        # Memory-efficient attention; is_causal=True applies the causal mask
        attn_out = F.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=None,  # causal masking handled by is_causal
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True,
        )  # [B, H, T, head_dim]

        # Merge the heads back and apply the output projection
        attn_out = attn_out.transpose(1, 2).contiguous().view(bsz, seq_len, embd)
        return self.resid_dropout(self.c_proj(attn_out))
150
+
151
+
152
class MLP(nn.Module):
    """
    Position-wise feed-forward network for the dense (non-MoE) blocks:
    Linear -> activation -> Linear -> Dropout, with a 4x hidden expansion.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

        # Look up the activation class; anything unknown is rejected.
        act_cls = {"gelu": nn.GELU, "relu": nn.ReLU}.get(config.activation_function)
        if act_cls is None:
            raise ValueError(f"Unbekannte Aktivierung: {config.activation_function}")
        self.activation = act_cls()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dropout(self.c_proj(self.activation(self.c_fc(x))))
176
+
177
+
178
class TransformerBlock(nn.Module):
    """
    Dense pre-norm transformer block: self-attention followed by an MLP,
    each wrapped in a residual connection.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm residual layout: x + f(norm(x))
        x = x + self.attn(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))
194
+
195
+
196
class MoETransformerBlock(nn.Module):
    """
    Transformer block whose feed-forward sublayer is a mixture-of-experts.

    Returns the block output together with the MoE auxiliary losses so the
    model can accumulate them across layers.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # NOTE(review): only config.capacity_factor is wired through here;
        # config.eval_capacity_factor is never used — confirm whether a larger
        # eval-time capacity was intended.
        self.moe = MoELayer(
            d_model=config.n_embd,
            n_experts=config.n_experts,
            n_experts_active=config.n_experts_active,
            use_noisy_gating=config.use_noisy_gating,
            capacity_factor=config.capacity_factor,
            bias=config.bias,
            dropout=config.dropout,
            activation=config.activation_function,
        )

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # Attention sublayer with residual connection
        x = x + self.attn(self.ln_1(x))

        # MoE sublayer with residual; auxiliary losses are passed upward
        moe_out, aux_loss, router_z_loss = self.moe(self.ln_2(x))
        return x + moe_out, aux_loss, router_z_loss
230
+
231
+
232
class MoEGPTPreTrainedModel(PreTrainedModel):
    """
    Base class wiring MoE GPT into the HuggingFace PreTrainedModel API.
    """

    config_class = MoEGPTConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Weight initialization following ST-MoE (Zoph et al. 2022):
        truncated normal with a fan-in-scaled, reduced std for MoE stability.

        BUG FIX: the previous ``isinstance(module, nn.Parameter)`` branch was
        dead code — ``Module.apply`` only visits nn.Module instances, so raw
        nn.Parameter tensors (e.g. the stacked expert weights in ExpertMLP)
        were never initialized. They are now handled via the module that owns
        them, in the fallback branch below.
        """
        if isinstance(module, nn.Linear):
            # Fan-in-scaled initialization
            fan_in = module.weight.shape[-1]
            std = (self.config.initializer_range / fan_in) ** 0.5

            torch.nn.init.trunc_normal_(
                module.weight,
                mean=0.0,
                std=std,
                a=-2 * std,
                b=2 * std,
            )
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)

        elif not isinstance(module, (nn.LayerNorm, nn.Dropout)):
            # Initialize raw (>= 2-D) parameters owned directly by other
            # modules, e.g. the stacked expert weight tensors. 1-D parameters
            # (e.g. norm scales) are intentionally left at their defaults.
            for name, param in module.named_parameters(recurse=False):
                if param.dim() < 2:
                    continue
                if "bias" in name:
                    torch.nn.init.zeros_(param)
                else:
                    # For x @ W the fan-in is the second-to-last dim of W.
                    fan_in = param.shape[-2]
                    std = (self.config.initializer_range / fan_in) ** 0.5
                    torch.nn.init.trunc_normal_(
                        param,
                        mean=0.0,
                        std=std,
                        a=-2 * std,
                        b=2 * std,
                    )
276
+
277
+
278
class MoEGPTModel(MoEGPTPreTrainedModel):
    """
    MoE GPT backbone (without the LM head).

    Alternates dense and MoE transformer blocks according to
    ``config.moe_layer_frequency`` and accumulates the MoE auxiliary losses.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__(config)
        self.config = config
        self.gradient_checkpointing = False  # for HF gradient-checkpointing support

        # Token embeddings only (positions are handled by RoPE in the attention layers)
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)

        # Transformer blocks (mixed: dense + MoE; layer 0 is always MoE)
        self.h = nn.ModuleList()
        for i in range(config.n_layer):
            if i % config.moe_layer_frequency == 0:
                # MoE block
                self.h.append(MoETransformerBlock(config))
            else:
                # Dense block
                self.h.append(TransformerBlock(config))

        # Final layer norm
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Initialize weights
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor,
        # NOTE(review): attention_mask is accepted for API compatibility but
        # is not used anywhere in this forward pass — confirm padding handling.
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Returns:
            hidden_states: [B, T, n_embd]
            total_aux_loss: sum of load-balancing losses over all MoE blocks
            total_router_z_loss: sum of router z-losses over all MoE blocks
        """
        device = input_ids.device  # NOTE(review): currently unused
        b, t = input_ids.size()

        assert t <= self.config.n_positions, f"Sequenz zu lang: {t} > {self.config.n_positions}"

        # Token embeddings only (RoPE is applied inside the attention layers)
        tok_emb = self.wte(input_ids)  # [B, T, n_embd]
        x = self.drop(tok_emb)

        # Accumulators for the auxiliary losses
        total_aux_loss = 0.0
        total_router_z_loss = 0.0

        # Run through all blocks
        for block in self.h:
            if isinstance(block, MoETransformerBlock):
                if self.gradient_checkpointing and self.training:
                    # Gradient checkpointing for MoE blocks; the wrapper keeps
                    # the multi-output signature intact through checkpoint()
                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)
                        return custom_forward

                    x, aux_loss, router_z_loss = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        x,
                        use_reentrant=False
                    )
                else:
                    x, aux_loss, router_z_loss = block(x)
                total_aux_loss = total_aux_loss + aux_loss
                total_router_z_loss = total_router_z_loss + router_z_loss
            else:
                if self.gradient_checkpointing and self.training:
                    x = torch.utils.checkpoint.checkpoint(
                        block,
                        x,
                        use_reentrant=False
                    )
                else:
                    x = block(x)

        x = self.ln_f(x)

        return x, total_aux_loss, total_router_z_loss
358
+
359
+
360
class MoEGPTForCausalLM(MoEGPTPreTrainedModel, GenerationMixin):
    """
    MoE GPT with a language-modeling head (for pretraining).
    Inherits from GenerationMixin for .generate() support.
    """

    # Tell HuggingFace which weights are tied/shared
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: MoEGPTConfig):
        super().__init__(config)
        self.transformer = MoEGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying (the LM head shares its weights with the token embedding)
        self.lm_head.weight = self.transformer.wte.weight

        # Initialize weights
        self.post_init()

    def get_output_embeddings(self):
        """For HuggingFace weight tying."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """For HuggingFace weight tying."""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """For HuggingFace weight tying."""
        return self.transformer.wte

    def set_input_embeddings(self, new_embeddings):
        """For HuggingFace weight tying."""
        self.transformer.wte = new_embeddings

    def tie_weights(self):
        """
        Tie lm_head weights to the input embeddings (weight tying).
        Called after loading a checkpoint to fix a missing lm_head.weight.
        """
        self.lm_head.weight = self.transformer.wte.weight

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # accept additional kwargs like use_cache for HuggingFace compatibility
    ) -> Union[Tuple, MoECausalLMOutput]:
        """
        Args:
            input_ids: Token ids [B, T].
            attention_mask: Passed through to the backbone.
            labels: If given, the shifted next-token cross-entropy loss is
                computed (plus the MoE auxiliary losses in training mode).
            return_dict: Whether to return a MoECausalLMOutput instead of a tuple.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Forward through the transformer backbone
        hidden_states, aux_loss, router_z_loss = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # LM head
        if labels is not None:
            # Training: logits for every position (needed for the shifted loss below)
            logits = self.lm_head(hidden_states)
        else:
            # Inference: only the last position is needed for sampling
            logits = self.lm_head(hidden_states[:, [-1], :])

        # Compute the loss
        loss = None
        if labels is not None:
            # Shift for next-token prediction
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Cross-entropy loss
            loss_fct = nn.CrossEntropyLoss()
            lm_loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

            # Add the MoE auxiliary losses (training only)
            loss = lm_loss
            if self.training:
                loss = loss + self.config.aux_loss_alpha * aux_loss
                loss = loss + self.config.router_z_loss_alpha * router_z_loss

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MoECausalLMOutput(
            loss=loss,
            logits=logits,
            aux_loss=aux_loss if self.training else None,
            router_z_loss=router_z_loss if self.training else None,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """
        For the HuggingFace generate() function.

        NOTE(review): all extra kwargs (e.g. attention_mask, past_key_values)
        are dropped here — only input_ids are forwarded.
        """
        return {"input_ids": input_ids}