Johnblick187
/

Grok-2

@@ -1,34 +1,6 @@
 """
-modeling_grok2.py — Custom Grok 2 modeling code for transformers.
-Allows AutoModel to load Johnblick187/grok-2.
-Exact tensor key names:
-  model.embed_tokens.weight                              [131072, 8192]
-  model.layers.N.pre_attn_norm.weight                   [8192]
-  model.layers.N.post_attn_norm.weight                  [8192]
-  model.layers.N.pre_moe_norm.weight                    [8192]
-  model.layers.N.post_moe_norm.weight                   [8192]
-  model.layers.N.self_attn.q_proj.weight                [8192, 8192]
-  model.layers.N.self_attn.k_proj.weight                [1024, 8192]
-  model.layers.N.self_attn.v_proj.weight                [1024, 8192]
-  model.layers.N.self_attn.o_proj.weight                [8192, 8192]
-  model.layers.N.mlp.gate_proj.weight                   [32768, 8192]
-  model.layers.N.mlp.up_proj.weight                     [32768, 8192]
-  model.layers.N.mlp.down_proj.weight                   [8192, 32768]
-  model.layers.N.block_sparse_moe.gate.weight           [8, 8192]
-  model.layers.N.block_sparse_moe.experts.E.w1.weight   [16384, 8192]
-  model.layers.N.block_sparse_moe.experts.E.w2.weight   [8192, 16384]
-  model.layers.N.block_sparse_moe.experts.E.w3.weight   [16384, 8192]
-  model.norm.weight                                     [8192]
-  lm_head.weight                                        [131072, 8192]
-Architecture:
-  64 layers, hidden=8192, 64 attn heads, 8 KV heads, head_dim=128
-  Dense residual MLP (SwiGLU): gate_proj, up_proj, down_proj
-  Sparse MoE: 8 experts, top-2, SwiGLU (w1=gate, w3=up, w2=down)
-  4x RMSNorm per layer (no bias)
-  RoPE with scaled theta
-  KV cache disabled — forward pass only, no past_key_values
 """
 import math
@@ -40,7 +12,6 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers import AutoConfig, AutoModelForCausalLM
-# ── Config ────────────────────────────────────────────────────────────────────
 class Grok2Config(PretrainedConfig):
     model_type = "grok2"
@@ -96,7 +67,6 @@ class Grok2Config(PretrainedConfig):
         )
-# ── RMSNorm ───────────────────────────────────────────────────────────────────
 class Grok2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
@@ -105,15 +75,16 @@ class Grok2RMSNorm(nn.Module):
     def forward(self, x):
         variance = x.pow(2).mean(-1, keepdim=True)
-        return self.weight * x * torch.rsqrt(variance + self.eps)
-# ── RoPE ──────────────────────────────────────────────────────────────────────
 def rotate_half(x):
     x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
     return torch.cat([-x2, x1], dim=-1)
 def apply_rotary_emb(q, k, cos, sin):
     return (q * cos) + (rotate_half(q) * sin), \
            (k * cos) + (rotate_half(k) * sin)
@@ -140,7 +111,6 @@ class Grok2RotaryEmbedding(nn.Module):
                self.sin_cached[:, :, :seq_len, :]
-# ── Attention ─────────────────────────────────────────────────────────────────
 class Grok2Attention(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -158,17 +128,18 @@ class Grok2Attention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         B, T, _ = hidden_states.shape
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
-        cos, sin = self.rotary_emb(T, hidden_states.device)
-        cos = cos[:, :, :T, :self.head_dim]
-        sin = sin[:, :, :T, :self.head_dim]
         q, k = apply_rotary_emb(q, k, cos, sin)
-        # GQA expand
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
@@ -181,21 +152,20 @@ class Grok2Attention(nn.Module):
             attn = attn * self.attn_softcap
         causal = torch.triu(
-            torch.full((T, T), float("-inf"), device=q.device, dtype=q.dtype),
             diagonal=1
         )
         attn = attn + causal.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
-            attn = attn + attention_mask
-        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
         out = torch.matmul(attn, v)
         out = out.transpose(1, 2).contiguous().view(B, T, -1)
         return self.o_proj(out)
-# ── MoE Expert ────────────────────────────────────────────────────────────────
 class Grok2Expert(nn.Module):
     def __init__(self, hidden_size, moe_intermediate_size):
         super().__init__()
@@ -207,14 +177,12 @@ class Grok2Expert(nn.Module):
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
-# ── Sparse MoE ────────────────────────────────────────────────────────────────
 class Grok2SparseMoE(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.top_k = config.num_experts_per_tok
         self.router_softcap = config.router_logit_softcapping
         self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
         self.experts = nn.ModuleList([
             Grok2Expert(config.hidden_size, config.moe_intermediate_size)
@@ -224,31 +192,33 @@ class Grok2SparseMoE(nn.Module):
     def forward(self, x):
         B, T, H = x.shape
         x_flat = x.view(-1, H)
         router_logits = self.gate(x_flat)
         if self.router_softcap > 0:
-            router_logits = router_logits / self.router_softcap
-            router_logits = torch.tanh(router_logits)
-            router_logits = router_logits * self.router_softcap
-        router_weights = F.softmax(router_logits, dim=-1)
         top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
         top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)
         out = torch.zeros_like(x_flat)
         for k in range(self.top_k):
-            expert_idx = top_indices[:, k]
-            weight = top_weights[:, k].unsqueeze(-1)
             for e in range(self.num_experts):
-                mask = (expert_idx == e)
-                if mask.any():
-                    out[mask] += weight[mask] * self.experts[e](x_flat[mask])
         return out.view(B, T, H)
-# ── Dense MLP ─────────────────────────────────────────────────────────────────
 class Grok2MLP(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -260,7 +230,6 @@ class Grok2MLP(nn.Module):
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
-# ── Decoder Layer ─────────────────────────────────────────────────────────────
 class Grok2DecoderLayer(nn.Module):
     def __init__(self, config: Grok2Config, layer_idx: int):
         super().__init__()
@@ -274,25 +243,22 @@ class Grok2DecoderLayer(nn.Module):
         self.post_moe_norm    = Grok2RMSNorm(config.hidden_size, config.rms_norm_eps)
     def forward(self, hidden_states, attention_mask=None, **kwargs):
-        # Attention
         residual = hidden_states
         hidden_states = self.pre_attn_norm(hidden_states)
         hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask)
         hidden_states = self.post_attn_norm(hidden_states)
         hidden_states = residual + hidden_states
-        # MoE + dense residual
         residual = hidden_states
         hidden_states = self.pre_moe_norm(hidden_states)
         moe_out = self.block_sparse_moe(hidden_states)
         mlp_out = self.mlp(hidden_states)
-        hidden_states = self.post_moe_norm(moe_out + mlp_out)
         hidden_states = residual + hidden_states
         return hidden_states
-# ── Model ─────────────────────────────────────────────────────────────────────
 class Grok2Model(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -310,7 +276,6 @@ class Grok2Model(nn.Module):
         return self.norm(hidden_states)
-# ── CausalLM ──────────────────────────────────────────────────────────────────
 class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = Grok2Config
     base_model_prefix = "model"
@@ -338,13 +303,10 @@ class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
         **kwargs,
     ):
         hidden_states = self.model(input_ids, attention_mask=attention_mask)
         logits = self.lm_head(hidden_states) * self.output_multiplier_scale
         if self.final_logit_softcapping > 0:
-            logits = logits / self.final_logit_softcapping
-            logits = torch.tanh(logits)
-            logits = logits * self.final_logit_softcapping
         loss = None
         if labels is not None:
@@ -356,16 +318,11 @@ class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
                 ignore_index=-100,
             )
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=None,
-        )
     def prepare_inputs_for_generation(self, input_ids, **kwargs):
         return {"input_ids": input_ids}
-# ── Register ──────────────────────────────────────────────────────────────────
 AutoConfig.register("grok2", Grok2Config)
 AutoModelForCausalLM.register(Grok2Config, Grok1ForCausalLM)

 """
+modeling_grok2.py — Grok 2 modeling code for transformers.
+Pure bf16, device-aware MoE; tensors are only cast to the activation dtype (no float32 upcast in softmax).
 """
 import math
 from transformers import AutoConfig, AutoModelForCausalLM
 class Grok2Config(PretrainedConfig):
     model_type = "grok2"
         )
 class Grok2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
     def forward(self, x):
         """Scale ``x`` by the reciprocal root-mean-square of its last dimension.

         The mean of squares is taken over the last axis (kept for
         broadcasting), ``self.eps`` is added before the reciprocal square
         root, and ``self.weight`` is moved to ``x``'s device before it is
         multiplied in. No dtype conversion is performed here.
         """
         mean_sq = torch.mean(x.pow(2), dim=-1, keepdim=True)
         inv_rms = torch.rsqrt(mean_sq + self.eps)
         gain = self.weight.to(x.device)
         return gain * x * inv_rms
 def rotate_half(x):
     """Swap the two halves of the last dimension, negating the half moved to the front.

     The last axis is split at ``x.shape[-1] // 2``; the result is the
     concatenation ``[-second_half, first_half]`` along that axis.
     """
     split = x.shape[-1] // 2
     front = x[..., :split]
     back = x[..., split:]
     return torch.cat((back.neg(), front), dim=-1)
 def apply_rotary_emb(q, k, cos, sin):
     """Apply the rotary transform ``t * cos + rotate_half(t) * sin`` to ``q`` and ``k``.

     ``cos`` and ``sin`` are first moved to ``q``'s device and dtype; the same
     converted tables are then used for ``k``.

     Returns:
         A ``(q_rotated, k_rotated)`` tuple.
     """
     cos, sin = (table.to(q.device, q.dtype) for table in (cos, sin))
     q_rot = (q * cos) + (rotate_half(q) * sin)
     k_rot = (k * cos) + (rotate_half(k) * sin)
     return q_rot, k_rot
                self.sin_cached[:, :, :seq_len, :]
 class Grok2Attention(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         B, T, _ = hidden_states.shape
+        dtype = hidden_states.dtype
+        device = hidden_states.device
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rotary_emb(T, device)
+        cos = cos[:, :, :T, :self.head_dim].to(dtype)
+        sin = sin[:, :, :T, :self.head_dim].to(dtype)
         q, k = apply_rotary_emb(q, k, cos, sin)
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
             attn = attn * self.attn_softcap
         causal = torch.triu(
+            torch.full((T, T), float("-inf"), device=device, dtype=dtype),
             diagonal=1
         )
         attn = attn + causal.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
+            attn = attn + attention_mask.to(device, dtype)
+        attn = F.softmax(attn, dim=-1).to(dtype)
         out = torch.matmul(attn, v)
         out = out.transpose(1, 2).contiguous().view(B, T, -1)
         return self.o_proj(out)
 class Grok2Expert(nn.Module):
     def __init__(self, hidden_size, moe_intermediate_size):
         super().__init__()
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
 class Grok2SparseMoE(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.top_k = config.num_experts_per_tok
         self.router_softcap = config.router_logit_softcapping
         self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
         self.experts = nn.ModuleList([
             Grok2Expert(config.hidden_size, config.moe_intermediate_size)
     def forward(self, x):
         """Route every token to its top-k experts and sum the weighted expert outputs.

         Args:
             x: Tensor unpacked as ``(B, T, H)``; it may be non-contiguous.

         Returns:
             Tensor of shape ``(B, T, H)`` holding, for each token, the sum of
             its selected experts' outputs scaled by the renormalised router
             weights.
         """
         B, T, H = x.shape
         # reshape (not view) so a non-contiguous input is flattened instead
         # of raising a RuntimeError.
         x_flat = x.reshape(-1, H)
         dtype = x_flat.dtype

         router_logits = self.gate(x_flat)
         if self.router_softcap > 0:
             # Soft-cap: squash router logits into (-softcap, softcap) via tanh.
             router_logits = torch.tanh(router_logits / self.router_softcap) * self.router_softcap
         router_weights = F.softmax(router_logits, dim=-1).to(dtype)

         top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
         # Renormalise so the selected experts' weights sum to 1 per token.
         top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)

         out = torch.zeros_like(x_flat)
         # One pass per expert: gather the tokens that selected it in any
         # top-k slot, so each expert runs at most once per call.
         for e in range(self.num_experts):
             token_ids, slot_ids = torch.where(top_indices == e)
             if token_ids.numel() == 0:
                 continue
             expert = self.experts[e]
             # The expert may sit on a different device than the activations:
             # ship its tokens and weights there, then bring the result back.
             expert_device = next(expert.parameters()).device
             x_e = x_flat[token_ids].to(expert_device)
             w_e = top_weights[token_ids, slot_ids].unsqueeze(-1).to(expert_device)
             y_e = expert(x_e) * w_e
             # topk returns distinct indices per token, so token_ids has no
             # duplicates and the indexed in-place add is safe.
             out[token_ids] += y_e.to(out.device)
         return out.view(B, T, H)
 class Grok2MLP(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
 class Grok2DecoderLayer(nn.Module):
     def __init__(self, config: Grok2Config, layer_idx: int):
         super().__init__()
         self.post_moe_norm    = Grok2RMSNorm(config.hidden_size, config.rms_norm_eps)
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         """Run one decoder layer: an attention sub-block, then a combined MoE + MLP sub-block.

         Each sub-block normalises its input, applies the branch, normalises
         the branch output, and adds the result to the sub-block's input.
         In the second sub-block the sparse MoE and the dense MLP both read
         the same normalised input; their outputs are summed (after moving
         the MoE output to the MLP output's device) before the post-norm.
         Extra ``**kwargs`` are accepted and ignored.
         """
         # Attention sub-block with residual connection.
         attn_branch = self.self_attn(
             self.pre_attn_norm(hidden_states),
             attention_mask=attention_mask,
         )
         hidden_states = hidden_states + self.post_attn_norm(attn_branch)

         # MoE and dense MLP share one normalised input.
         ff_input = self.pre_moe_norm(hidden_states)
         moe_out = self.block_sparse_moe(ff_input)
         mlp_out = self.mlp(ff_input)
         ff_branch = self.post_moe_norm(moe_out.to(mlp_out.device) + mlp_out)
         return hidden_states + ff_branch
 class Grok2Model(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         return self.norm(hidden_states)
 class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = Grok2Config
     base_model_prefix = "model"
         **kwargs,
     ):
         hidden_states = self.model(input_ids, attention_mask=attention_mask)
         logits = self.lm_head(hidden_states) * self.output_multiplier_scale
         if self.final_logit_softcapping > 0:
+            logits = torch.tanh(logits / self.final_logit_softcapping) * self.final_logit_softcapping
         loss = None
         if labels is not None:
                 ignore_index=-100,
             )
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None)
     def prepare_inputs_for_generation(self, input_ids, **kwargs):
         """Build the model inputs for one generation step.

         Only ``input_ids`` is forwarded; every other keyword argument is
         ignored.
         """
         model_inputs = {"input_ids": input_ids}
         return model_inputs
 AutoConfig.register("grok2", Grok2Config)
 AutoModelForCausalLM.register(Grok2Config, Grok1ForCausalLM)