Rewrite modeling_eve.py with HF-compatible EveMoEForCausalLM
modeling_eve.py   CHANGED   (+370 −55)
@@ -1,169 +1,484 @@
"""
Eve-2-MoE — Custom Mixture of Experts Language Model
=====================================================
Architecture: DeepSeek-V3 style Shared Expert + Top-K Routed Experts + RoPE
Author: Anthony Maio / Making Minds AI Research
License: MIT

Usage (HuggingFace):
    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(
        "anthonym21/Eve-2-MoE-272M", trust_remote_code=True
    )

Usage (standalone):
    from modeling_eve import ModelConfig, DeepSeekMoE
    model = DeepSeekMoE(ModelConfig())
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from collections import OrderedDict
from dataclasses import dataclass


# ============================================================
# Standalone config (no transformers dependency)
# ============================================================

@dataclass
class ModelConfig:
    """Configuration for Eve-2-MoE (standalone, no HF dependency)."""

    # Model dimensions
    vocab_size: int = 50304
    n_layer: int = 12
    n_embd: int = 512
    n_head: int = 8
    head_dim: int = 64
    block_size: int = 2048

    # MoE settings
    num_experts: int = 8
    top_k: int = 2
    expert_intermediate_size: int = 1408
    shared_expert_intermediate_size: int = 1408
    router_aux_loss_coef: float = 0.01

    # Training settings
    use_checkpointing: bool = False  # Gradient checkpointing (saves VRAM, costs speed)

    # RoPE settings
    rope_theta: float = 10000.0


# ============================================================
# Utility: strip torch.compile prefix from state dicts
# ============================================================

def _strip_orig_mod_prefix(state_dict):
    """Remove '_orig_mod.' prefix from keys saved by torch.compile'd models."""
    cleaned = OrderedDict()
    for k, v in state_dict.items():
        cleaned[k.replace("_orig_mod.", "")] = v
    return cleaned


# ============================================================
# Building blocks (shared by standalone and HF models)
# ============================================================

class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight


def precompute_rope_freqs(head_dim: int, max_seq_len: int, theta: float = 10000.0,
                          device: torch.device = None) -> torch.Tensor:
    """Precompute the complex exponential frequencies for RoPE.

    Returns a (max_seq_len, head_dim // 2) complex tensor.
    """
    freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
    t = torch.arange(max_seq_len, device=device).float()
    freqs = torch.outer(t, freqs)
    return torch.polar(torch.ones_like(freqs), freqs)  # complex64


def apply_rope(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Apply rotary position embeddings to input tensor.

    Args:
        x: (B, n_head, T, head_dim)
        freqs_cis: (T, head_dim // 2) complex
    Returns:
        (B, n_head, T, head_dim) with rotary embeddings applied
    """
    # Reshape x to complex: (B, n_head, T, head_dim//2, 2) -> complex
    B, H, T, D = x.shape
    x_complex = torch.view_as_complex(x.float().reshape(B, H, T, D // 2, 2))
    # Broadcast freqs_cis: (1, 1, T, head_dim//2)
    freqs_cis = freqs_cis[:T].unsqueeze(0).unsqueeze(0)
    x_rotated = x_complex * freqs_cis
    # Back to real: (B, H, T, head_dim)
    return torch.view_as_real(x_rotated).reshape(B, H, T, D).type_as(x)


class MLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config, intermediate_size: int = None):
        super().__init__()
        hidden_dim = intermediate_size or config.expert_intermediate_size
        self.w1 = nn.Linear(config.n_embd, hidden_dim, bias=False)      # Gate
        self.w2 = nn.Linear(config.n_embd, hidden_dim, bias=False)      # Up
        self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=False)  # Down

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.c_proj(F.silu(self.w1(x)) * self.w2(x))


class SharedMoE(nn.Module):
    """Mixture of Experts with one shared expert and K routed experts.

    DeepSeek-V3 style: a shared expert processes all tokens while a top-k
    router selects from a pool of specialized experts per token.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.top_k

        # Shared expert (always active)
        self.shared_expert = MLP(config, config.shared_expert_intermediate_size)

        # Routed experts
        self.experts = nn.ModuleList([MLP(config) for _ in range(config.num_experts)])
        self.router = nn.Linear(config.n_embd, config.num_experts, bias=False)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        B, T, C = x.shape

        # Shared path
        shared_out = self.shared_expert(x)

        # Router
        logits = self.router(x)
        probs = F.softmax(logits, dim=-1)

        # Top-K selection with normalized weights
        top_k_weights, top_k_indices = torch.topk(probs, self.top_k, dim=-1)
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)

        # Load balancing auxiliary loss
        flat_probs = probs.view(-1, self.config.num_experts)
        expert_usage = flat_probs.mean(dim=0)
        aux_loss = torch.sum(expert_usage * expert_usage) * self.config.num_experts

        # Route tokens to experts
        routed_out = torch.zeros_like(x)
        flat_x = x.view(-1, C)
        flat_indices = top_k_indices.view(-1, self.top_k)
        flat_weights = top_k_weights.view(-1, self.top_k)

        for i, expert in enumerate(self.experts):
            mask = flat_indices == i
            batch_idx, rank_idx = torch.where(mask)

            if batch_idx.numel() > 0:
                expert_input = flat_x[batch_idx]
                expert_output = expert(expert_input)
                weight = flat_weights[batch_idx, rank_idx].unsqueeze(-1)
                routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)

        return shared_out + routed_out, aux_loss


class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with Rotary Position Embeddings."""

    def __init__(self, config):
        super().__init__()
        self.n_head = config.n_head
        self.head_dim = config.head_dim
        self.n_embd = config.n_embd

        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape

        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        # Apply RoPE to Q and K
        q = apply_rope(q, freqs_cis)
        k = apply_rope(k, freqs_cis)

        # Flash Attention (auto-dispatches to cuDNN/FlashAttn kernels)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)


class Block(nn.Module):
    """Transformer block: RMSNorm -> Attention -> RMSNorm -> MoE."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = RMSNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = RMSNorm(config.n_embd)
        self.mlp = SharedMoE(config)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        x = x + self.attn(self.ln_1(x), freqs_cis)
        mlp_out, aux_loss = self.mlp(self.ln_2(x))
        x = x + mlp_out
        return x, aux_loss


# ============================================================
# Standalone model (backward compatible, no HF dependency)
# ============================================================

class DeepSeekMoE(nn.Module):
    """Eve-2-MoE: DeepSeek-V3 style Mixture of Experts language model.

    Standalone nn.Module — works without the transformers library.
    For HuggingFace integration, use EveMoEForCausalLM instead.

    Architecture:
        - Token embeddings (no learned position embeddings — uses RoPE)
        - N transformer blocks with RoPE attention + shared MoE FFN
        - RMSNorm + tied linear head
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=RMSNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying
        self.transformer.wte.weight = self.lm_head.weight

        # Precompute RoPE frequencies (registered as buffer so they move with .to(device))
        freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx: torch.Tensor, targets: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
        B, T = idx.shape
        assert T <= self.config.block_size, f"Sequence length {T} exceeds block_size {self.config.block_size}"

        x = self.transformer.wte(idx)

        total_aux_loss = 0.0
        for block in self.transformer.h:
            if self.config.use_checkpointing and self.training:
                x, aux_loss = torch.utils.checkpoint.checkpoint(
                    block, x, self.freqs_cis, use_reentrant=False
                )
            else:
                x, aux_loss = block(x, self.freqs_cis)
            total_aux_loss += aux_loss

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            loss = loss + self.config.router_aux_loss_coef * total_aux_loss

        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int,
                 temperature: float = 0.8, top_k: int = 50) -> torch.Tensor:
        """Autoregressive generation with temperature and top-k sampling."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


# ============================================================
# HuggingFace PreTrainedModel integration
# (only available when transformers is installed)
# ============================================================

try:
    from transformers import PreTrainedModel
    from transformers.modeling_outputs import CausalLMOutputWithPast

    try:
        from .configuration_eve import EveConfig
    except ImportError:
        from configuration_eve import EveConfig

    class EveMoEPreTrainedModel(PreTrainedModel):
        """Base class for Eve-2-MoE HuggingFace models."""

        config_class = EveConfig
        base_model_prefix = "transformer"
        supports_gradient_checkpointing = True
        _no_split_modules = ["Block"]

        def _init_weights(self, module):
            std = 0.02
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=std)

    class EveMoEForCausalLM(EveMoEPreTrainedModel):
        """Eve-2-MoE for causal language modeling (HuggingFace compatible).

        This model has the same weights and architecture as DeepSeekMoE but
        follows HuggingFace conventions for from_pretrained() and generate().

        Usage:
            from transformers import AutoModelForCausalLM
            model = AutoModelForCausalLM.from_pretrained(
                "anthonym21/Eve-2-MoE-272M", trust_remote_code=True
            )
            output = model.generate(input_ids, max_new_tokens=100)
        """

        _tied_weights_keys = ["lm_head.weight"]

        def __init__(self, config: EveConfig):
            super().__init__(config)

            self.transformer = nn.ModuleDict(dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=RMSNorm(config.n_embd),
            ))
            self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

            # Precompute RoPE frequencies
            freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
            self.register_buffer("freqs_cis", freqs_cis, persistent=False)

            # Initialize weights and apply final processing
            self.post_init()

        def get_input_embeddings(self):
            return self.transformer.wte

        def set_input_embeddings(self, value):
            self.transformer.wte = value

        def get_output_embeddings(self):
            return self.lm_head

        def set_output_embeddings(self, new_embeddings):
            self.lm_head = new_embeddings

        def forward(
            self,
            input_ids: torch.LongTensor = None,
            attention_mask: torch.Tensor = None,
            labels: torch.LongTensor = None,
            return_dict: bool = None,
            **kwargs,
        ):
            """
            Args:
                input_ids: Token IDs, shape (batch, seq_len).
                attention_mask: Ignored (model uses causal mask via Flash Attention).
                    Accepted for pipeline/generate() compatibility.
                labels: Language modeling labels. Same shape as input_ids.
                    The loss is computed with internal shift (labels[..., 1:] predicted
                    from input[..., :-1]), following HuggingFace convention.
                return_dict: Whether to return a CausalLMOutputWithPast or a tuple.
            """
            return_dict = return_dict if return_dict is not None else self.config.use_return_dict

            B, T = input_ids.shape
            assert T <= self.config.block_size, \
                f"Sequence length {T} exceeds block_size {self.config.block_size}"

            x = self.transformer.wte(input_ids)

            total_aux_loss = 0.0
            for block in self.transformer.h:
                if self.config.use_checkpointing and self.training:
                    x, aux_loss = torch.utils.checkpoint.checkpoint(
                        block, x, self.freqs_cis, use_reentrant=False
                    )
                else:
                    x, aux_loss = block(x, self.freqs_cis)
                total_aux_loss += aux_loss

            x = self.transformer.ln_f(x)
            logits = self.lm_head(x)

            loss = None
            if labels is not None:
                # Shift so that tokens < n predict n (HF convention)
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                loss = F.cross_entropy(
                    shift_logits.view(-1, self.config.vocab_size),
                    shift_labels.view(-1),
                )
                loss = loss + self.config.router_aux_loss_coef * total_aux_loss

            if not return_dict:
                output = (logits,)
                return (loss,) + output if loss is not None else output

            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
            )

        def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
            # Truncate to block_size for models without KV cache
            if input_ids.shape[1] > self.config.block_size:
                input_ids = input_ids[:, -self.config.block_size:]
                if attention_mask is not None:
                    attention_mask = attention_mask[:, -self.config.block_size:]

            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }

        def load_state_dict(self, state_dict, *args, **kwargs):
            """Override to handle weights saved from torch.compile'd models."""
            # Strip _orig_mod. prefix if present (torch.compile artifact)
            if any(k.startswith("_orig_mod.") for k in state_dict.keys()):
                state_dict = _strip_orig_mod_prefix(state_dict)
            return super().load_state_dict(state_dict, *args, **kwargs)

except ImportError:
    # transformers not installed — standalone usage only (DeepSeekMoE + ModelConfig)
    pass
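The HF classes above import EveConfig from configuration_eve.py, which is not part of this commit. As a rough sketch, such a config only needs to subclass PretrainedConfig and expose the attributes this file reads; the field names, defaults, and the model_type string below are assumptions mirrored from ModelConfig, not the repo's actual configuration_eve.py.

from transformers import PretrainedConfig

class EveConfig(PretrainedConfig):
    # model_type is an assumption; the real value lives in configuration_eve.py
    model_type = "eve_moe"

    def __init__(self, vocab_size=50304, n_layer=12, n_embd=512, n_head=8,
                 head_dim=64, block_size=2048, num_experts=8, top_k=2,
                 expert_intermediate_size=1408, shared_expert_intermediate_size=1408,
                 router_aux_loss_coef=0.01, use_checkpointing=False,
                 rope_theta=10000.0, **kwargs):
        # Attributes read by EveMoEForCausalLM / Block / SharedMoE above
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = head_dim
        self.block_size = block_size
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_intermediate_size = expert_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.router_aux_loss_coef = router_aux_loss_coef
        self.use_checkpointing = use_checkpointing
        self.rope_theta = rope_theta
        super().__init__(**kwargs)

With a config class like this on the Hub, AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True) typically resolves EveMoEForCausalLM through an auto_map entry in the repo's config.json.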
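For a quick local check of the standalone path described in the module docstring, a minimal smoke test is sketched below. It assumes this file is importable as modeling_eve; the reduced dimensions are arbitrary and only need to satisfy n_embd == n_head * head_dim.

import torch
from modeling_eve import ModelConfig, DeepSeekMoE

# Tiny configuration for a CPU smoke test (sizes are illustrative only)
config = ModelConfig(
    vocab_size=1000, n_layer=2, n_embd=64, n_head=4, head_dim=16,
    block_size=128, num_experts=4, top_k=2,
    expert_intermediate_size=128, shared_expert_intermediate_size=128,
)
model = DeepSeekMoE(config)

idx = torch.randint(0, config.vocab_size, (2, 32))  # (batch, seq_len)
logits, loss = model(idx, targets=idx)              # loss includes the router aux term
print(logits.shape, loss.item())                    # torch.Size([2, 32, 1000]) plus a scalar loss

out = model.generate(idx[:, :4], max_new_tokens=8, temperature=0.8, top_k=50)
print(out.shape)                                    # torch.Size([2, 12])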