anthonym21
/

Eve-2-MoE-272M

@@ -72,26 +72,18 @@ class SharedMoE(nn.Module):
                 routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)
         return shared_out + routed_out, aux_loss
-class Block(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.ln_1 = RMSNorm(config.n_embd)
-        self.ln_2 = RMSNorm(config.n_embd)
-        # Attention components
         self.n_head = config.n_head
         self.head_dim = config.head_dim
         self.n_embd = config.n_embd
         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
-        self.mlp = SharedMoE(config)
     def forward(self, x, freqs_cis):
-        # Attention Block
         B, T, C = x.shape
-        h = self.ln_1(x)
-        qkv = self.c_attn(h)
         q, k, v = qkv.split(self.n_embd, dim=2)
         q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
         k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
@@ -100,17 +92,26 @@ class Block(nn.Module):
         k = apply_rope(k, freqs_cis)
         y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
         y = y.transpose(1, 2).contiguous().view(B, T, C)
-        attn_out = self.c_proj(y)
         x = x + attn_out
-        # MoE Block
         mlp_out, aux_loss = self.mlp(self.ln_2(x))
         x = x + mlp_out
         return x, aux_loss
 class DeepSeekMoE(PreTrainedModel):
     config_class = EveConfig
-    _tied_weights_keys = ["lm_head.weight"]  # <--- THE FIX IS HERE
     def __init__(self, config):
         super().__init__(config)
@@ -122,7 +123,7 @@ class DeepSeekMoE(PreTrainedModel):
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        # Tie weights manually (HF checks this flag)
         self.transformer.wte.weight = self.lm_head.weight
         freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
@@ -148,7 +149,6 @@ class DeepSeekMoE(PreTrainedModel):
         x = self.transformer.wte(idx)
         total_aux_loss = 0.0
-        # Ensure rope freqs are on correct device
         freqs_cis = self.freqs_cis.to(x.device)
         for block in self.transformer.h:

                 routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)
         return shared_out + routed_out, aux_loss
+class CausalSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.n_head = config.n_head
         self.head_dim = config.head_dim
         self.n_embd = config.n_embd
         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
     def forward(self, x, freqs_cis):
         B, T, C = x.shape
+        qkv = self.c_attn(x)
         q, k, v = qkv.split(self.n_embd, dim=2)
         q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
         k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
         k = apply_rope(k, freqs_cis)
         y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
         y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.c_proj(y)
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.n_embd)
+        self.ln_2 = RMSNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config) # Named 'attn' to match safetensors
+        self.mlp = SharedMoE(config)
+    def forward(self, x, freqs_cis):
+        attn_out = self.attn(self.ln_1(x), freqs_cis)
         x = x + attn_out
         mlp_out, aux_loss = self.mlp(self.ln_2(x))
         x = x + mlp_out
         return x, aux_loss
 class DeepSeekMoE(PreTrainedModel):
     config_class = EveConfig
+    _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config):
         super().__init__(config)
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Tie weights
         self.transformer.wte.weight = self.lm_head.weight
         freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
         x = self.transformer.wte(idx)
         total_aux_loss = 0.0
         freqs_cis = self.freqs_cis.to(x.device)
         for block in self.transformer.h: