anthonym21
/

Eve-2-MoE-250M

Model card Files Files and versions

xet

Community

anthonym21 committed on Feb 6

Commit

1010007

1 Parent(s): 9f12aaa

Upload modeling_eve.py with huggingface_hub

Browse files

Files changed (1) hide show

modeling_eve.py +286 -0

modeling_eve.py ADDED Viewed

	@@ -0,0 +1,286 @@

+"""
+Eve-2-MoE — Custom Mixture of Experts Language Model
+Architecture: DeepSeek-V3 style Shared Expert + Top-K Routed Experts + RoPE
+Author: Anthony Maio / Making Minds AI Research
+License: MIT
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from dataclasses import dataclass
+@dataclass
+class ModelConfig:
+    """Configuration for Eve-2-MoE."""
+    # Model dimensions
+    vocab_size: int = 50304
+    n_layer: int = 12
+    n_embd: int = 512
+    n_head: int = 8
+    head_dim: int = 64
+    block_size: int = 2048
+    # MoE settings
+    num_experts: int = 8
+    top_k: int = 2
+    expert_intermediate_size: int = 1408
+    shared_expert_intermediate_size: int = 1408
+    router_aux_loss_coef: float = 0.01
+    # Training settings
+    use_checkpointing: bool = False  # Gradient checkpointing (saves VRAM, costs speed)
+    # RoPE settings
+    rope_theta: float = 10000.0
+class RMSNorm(nn.Module):
+    """Root Mean Square Layer Normalization."""
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
+def precompute_rope_freqs(head_dim: int, max_seq_len: int, theta: float = 10000.0,
+                          device: torch.device = None) -> torch.Tensor:
+    """Precompute the complex exponential frequencies for RoPE.
+    Returns a (max_seq_len, head_dim // 2) complex tensor.
+    """
+    freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
+    t = torch.arange(max_seq_len, device=device).float()
+    freqs = torch.outer(t, freqs)
+    return torch.polar(torch.ones_like(freqs), freqs)  # complex64
+def apply_rope(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+    """Apply rotary position embeddings to input tensor.
+    Args:
+        x: (B, n_head, T, head_dim)
+        freqs_cis: (T, head_dim // 2) complex
+    Returns:
+        (B, n_head, T, head_dim) with rotary embeddings applied
+    """
+    # Reshape x to complex: (B, n_head, T, head_dim//2, 2) -> complex
+    B, H, T, D = x.shape
+    x_complex = torch.view_as_complex(x.float().reshape(B, H, T, D // 2, 2))
+    # Broadcast freqs_cis: (1, 1, T, head_dim//2)
+    freqs_cis = freqs_cis[:T].unsqueeze(0).unsqueeze(0)
+    x_rotated = x_complex * freqs_cis
+    # Back to real: (B, H, T, head_dim)
+    return torch.view_as_real(x_rotated).reshape(B, H, T, D).type_as(x)
+class MLP(nn.Module):
+    """Feed-forward network with SwiGLU activation."""
+    def __init__(self, config: ModelConfig, intermediate_size: int = None):
+        super().__init__()
+        hidden_dim = intermediate_size or config.expert_intermediate_size
+        self.w1 = nn.Linear(config.n_embd, hidden_dim, bias=False)  # Gate
+        self.w2 = nn.Linear(config.n_embd, hidden_dim, bias=False)  # Up
+        self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=False)  # Down
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.c_proj(F.silu(self.w1(x)) * self.w2(x))
+class SharedMoE(nn.Module):
+    """Mixture of Experts with one shared expert and K routed experts.
+    DeepSeek-V3 style: a shared expert processes all tokens while a top-k
+    router selects from a pool of specialized experts per token.
+    """
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        self.top_k = config.top_k
+        # Shared expert (always active)
+        self.shared_expert = MLP(config, config.shared_expert_intermediate_size)
+        # Routed experts
+        self.experts = nn.ModuleList([MLP(config) for _ in range(config.num_experts)])
+        self.router = nn.Linear(config.n_embd, config.num_experts, bias=False)
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        B, T, C = x.shape
+        # Shared path
+        shared_out = self.shared_expert(x)
+        # Router
+        logits = self.router(x)
+        probs = F.softmax(logits, dim=-1)
+        # Top-K selection with normalized weights
+        top_k_weights, top_k_indices = torch.topk(probs, self.top_k, dim=-1)
+        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
+        # Load balancing auxiliary loss
+        flat_probs = probs.view(-1, self.config.num_experts)
+        expert_usage = flat_probs.mean(dim=0)
+        aux_loss = torch.sum(expert_usage * expert_usage) * self.config.num_experts
+        # Route tokens to experts
+        routed_out = torch.zeros_like(x)
+        flat_x = x.view(-1, C)
+        flat_indices = top_k_indices.view(-1, self.top_k)
+        flat_weights = top_k_weights.view(-1, self.top_k)
+        for i, expert in enumerate(self.experts):
+            mask = flat_indices == i
+            batch_idx, rank_idx = torch.where(mask)
+            if batch_idx.numel() > 0:
+                expert_input = flat_x[batch_idx]
+                expert_output = expert(expert_input)
+                weight = flat_weights[batch_idx, rank_idx].unsqueeze(-1)
+                routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)
+        return shared_out + routed_out, aux_loss
+class CausalSelfAttention(nn.Module):
+    """Multi-head causal self-attention with Rotary Position Embeddings."""
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.n_head = config.n_head
+        self.head_dim = config.head_dim
+        self.n_embd = config.n_embd
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
+    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+        B, T, C = x.shape
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        # Apply RoPE to Q and K
+        q = apply_rope(q, freqs_cis)
+        k = apply_rope(k, freqs_cis)
+        # Flash Attention (auto-dispatches to cuDNN/FlashAttn kernels)
+        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.c_proj(y)
+class Block(nn.Module):
+    """Transformer block: RMSNorm → Attention → RMSNorm → MoE."""
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = RMSNorm(config.n_embd)
+        self.mlp = SharedMoE(config)
+    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        x = x + self.attn(self.ln_1(x), freqs_cis)
+        mlp_out, aux_loss = self.mlp(self.ln_2(x))
+        x = x + mlp_out
+        return x, aux_loss
+class DeepSeekMoE(nn.Module):
+    """Eve-2-MoE: DeepSeek-V3 style Mixture of Experts language model.
+    Architecture:
+        - Token embeddings (no learned position embeddings — uses RoPE)
+        - N transformer blocks with RoPE attention + shared MoE FFN
+        - RMSNorm + tied linear head
+    """
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = nn.ModuleDict(dict(
+            wte=nn.Embedding(config.vocab_size, config.n_embd),
+            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f=RMSNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Weight tying
+        self.transformer.wte.weight = self.lm_head.weight
+        # Precompute RoPE frequencies (registered as buffer so they move with .to(device))
+        freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
+        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
+        # Initialize weights
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def forward(self, idx: torch.Tensor, targets: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
+        B, T = idx.shape
+        assert T <= self.config.block_size, f"Sequence length {T} exceeds block_size {self.config.block_size}"
+        x = self.transformer.wte(idx)
+        total_aux_loss = 0.0
+        for block in self.transformer.h:
+            if self.config.use_checkpointing and self.training:
+                x, aux_loss = torch.utils.checkpoint.checkpoint(
+                    block, x, self.freqs_cis, use_reentrant=False
+                )
+            else:
+                x, aux_loss = block(x, self.freqs_cis)
+            total_aux_loss += aux_loss
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+            loss = loss + self.config.router_aux_loss_coef * total_aux_loss
+        return logits, loss
+    @torch.no_grad()
+    def generate(self, idx: torch.Tensor, max_new_tokens: int,
+                 temperature: float = 0.8, top_k: int = 50) -> torch.Tensor:
+        """Autoregressive generation with temperature and top-k sampling."""
+        for _ in range(max_new_tokens):
+            idx_cond = idx[:, -self.config.block_size:]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / temperature
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float("Inf")
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx