Johnblick187
/

Grok-2

@@ -1,6 +1,6 @@
 """
-modeling_grok2.py — Grok 2 modeling code for transformers.
-Pure bf16, device-aware MoE, no dtype casting.
 """
 import math
@@ -12,6 +12,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers import AutoConfig, AutoModelForCausalLM
 class Grok2Config(PretrainedConfig):
     model_type = "grok2"
@@ -67,6 +68,7 @@ class Grok2Config(PretrainedConfig):
         )
 class Grok2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
@@ -74,17 +76,18 @@ class Grok2RMSNorm(nn.Module):
         self.eps = eps
     def forward(self, x):
         variance = x.pow(2).mean(-1, keepdim=True)
-        return self.weight.to(x.device) * x * torch.rsqrt(variance + self.eps)
 def rotate_half(x):
     """Swap the two halves of the last axis, negating the second half.

     The last axis is split at ``x.shape[-1] // 2``; the result is the
     concatenation ``[-second, first]`` along that same axis.
     """
     split_at = x.shape[-1] // 2
     first = x[..., :split_at]
     second = x[..., split_at:]
     return torch.cat((-second, first), dim=-1)
 def apply_rotary_emb(q, k, cos, sin):
-    cos = cos.to(q.device, q.dtype)
-    sin = sin.to(q.device, q.dtype)
     return (q * cos) + (rotate_half(q) * sin), \
            (k * cos) + (rotate_half(k) * sin)
@@ -93,24 +96,25 @@ class Grok2RotaryEmbedding(nn.Module):
         super().__init__()
         base = base * scaling_factor
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer("inv_freq", inv_freq)
         self._cached_len = 0
-    def _build_cache(self, seq_len, device):
         t = torch.arange(seq_len, device=device).float()
         freqs = torch.outer(t, self.inv_freq.to(device))
         emb = torch.cat([freqs, freqs], dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
         self._cached_len = seq_len
-    def forward(self, seq_len, device):
-        if seq_len > self._cached_len:
-            self._build_cache(seq_len, device)
-        return self.cos_cached[:, :, :seq_len, :], \
-               self.sin_cached[:, :, :seq_len, :]
 class Grok2Attention(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -128,18 +132,19 @@ class Grok2Attention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         B, T, _ = hidden_states.shape
-        dtype = hidden_states.dtype
         device = hidden_states.device
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
-        cos, sin = self.rotary_emb(T, device)
-        cos = cos[:, :, :T, :self.head_dim].to(dtype)
-        sin = sin[:, :, :T, :self.head_dim].to(dtype)
         q, k = apply_rotary_emb(q, k, cos, sin)
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
@@ -147,9 +152,7 @@ class Grok2Attention(nn.Module):
         attn = torch.matmul(q, k.transpose(-2, -1)) / scale
         if self.attn_softcap > 0:
-            attn = attn / self.attn_softcap
-            attn = torch.tanh(attn)
-            attn = attn * self.attn_softcap
         causal = torch.triu(
             torch.full((T, T), float("-inf"), device=device, dtype=dtype),
@@ -158,7 +161,7 @@ class Grok2Attention(nn.Module):
         attn = attn + causal.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
-            attn = attn + attention_mask.to(device, dtype)
         attn = F.softmax(attn, dim=-1).to(dtype)
         out = torch.matmul(attn, v)
@@ -166,6 +169,7 @@ class Grok2Attention(nn.Module):
         return self.o_proj(out)
 class Grok2Expert(nn.Module):
     def __init__(self, hidden_size, moe_intermediate_size):
         super().__init__()
@@ -177,12 +181,14 @@ class Grok2Expert(nn.Module):
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
 class Grok2SparseMoE(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.top_k = config.num_experts_per_tok
         self.router_softcap = config.router_logit_softcapping
         self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
         self.experts = nn.ModuleList([
             Grok2Expert(config.hidden_size, config.moe_intermediate_size)
@@ -191,14 +197,15 @@ class Grok2SparseMoE(nn.Module):
     def forward(self, x):
         B, T, H = x.shape
         x_flat = x.view(-1, H)
-        dtype = x_flat.dtype
         router_logits = self.gate(x_flat)
         if self.router_softcap > 0:
             router_logits = torch.tanh(router_logits / self.router_softcap) * self.router_softcap
-        router_weights = F.softmax(router_logits, dim=-1).to(dtype)
         top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
         top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)
@@ -210,15 +217,16 @@ class Grok2SparseMoE(nn.Module):
                 mask = (expert_ids == e)
                 if not mask.any():
                     continue
                 expert_device = next(self.experts[e].parameters()).device
-                x_e = x_flat[mask].to(expert_device)
-                w_e = weights[mask].to(expert_device)
-                y_e = self.experts[e](x_e) * w_e
-                out[mask] += y_e.to(out.device)
         return out.view(B, T, H)
 class Grok2MLP(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -230,6 +238,7 @@ class Grok2MLP(nn.Module):
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
 class Grok2DecoderLayer(nn.Module):
     def __init__(self, config: Grok2Config, layer_idx: int):
         super().__init__()
@@ -243,22 +252,29 @@ class Grok2DecoderLayer(nn.Module):
         self.post_moe_norm    = Grok2RMSNorm(config.hidden_size, config.rms_norm_eps)
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         residual = hidden_states
         hidden_states = self.pre_attn_norm(hidden_states)
         hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask)
-        hidden_states = self.post_attn_norm(hidden_states)
-        hidden_states = residual + hidden_states
         residual = hidden_states
-        hidden_states = self.pre_moe_norm(hidden_states)
-        moe_out = self.block_sparse_moe(hidden_states)
-        mlp_out = self.mlp(hidden_states)
-        hidden_states = self.post_moe_norm(moe_out.to(mlp_out.device) + mlp_out)
-        hidden_states = residual + hidden_states
         return hidden_states
 class Grok2Model(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
@@ -276,6 +292,7 @@ class Grok2Model(nn.Module):
         return self.norm(hidden_states)
 class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = Grok2Config
     base_model_prefix = "model"
@@ -303,6 +320,8 @@ class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
         **kwargs,
     ):
         hidden_states = self.model(input_ids, attention_mask=attention_mask)
         logits = self.lm_head(hidden_states) * self.output_multiplier_scale
         if self.final_logit_softcapping > 0:
@@ -324,5 +343,6 @@ class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
         return {"input_ids": input_ids}
 AutoConfig.register("grok2", Grok2Config)
-AutoModelForCausalLM.register(Grok2Config, Grok1ForCausalLM)

 """
+modeling_grok2.py — Grok 2 for transformers, full multi-GPU support.
+Pure bf16 throughout. Device-aware at every operation.
 """
 import math
 from transformers import AutoConfig, AutoModelForCausalLM
+# ── Config ────────────────────────────────────────────────────────────────────
 class Grok2Config(PretrainedConfig):
     model_type = "grok2"
         )
+# ── RMSNorm ───────────────────────────────────────────────────────────────────
 class Grok2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
         self.eps = eps
     def forward(self, x):
+        # Stay in input dtype throughout
         variance = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.eps)
+        return self.weight.to(x.device, x.dtype) * x
+# ── RoPE ──────────────────────────────────────────────────────────────────────
 def rotate_half(x):
     """Swap the two halves of the last axis, negating the second half.

     The last axis is split at ``x.shape[-1] // 2``; the result is the
     concatenation ``[-second, first]`` along that same axis.
     """
     split_at = x.shape[-1] // 2
     first = x[..., :split_at]
     second = x[..., split_at:]
     return torch.cat((-second, first), dim=-1)
 def apply_rotary_emb(q, k, cos, sin):
     """Apply the rotary transform to the query and key tensors.

     Each tensor ``t`` is mapped to ``t * cos + rotate_half(t) * sin`` using
     elementwise arithmetic. ``cos`` and ``sin`` are used exactly as passed;
     no device or dtype conversion happens here.

     Returns:
         A ``(rotated_q, rotated_k)`` tuple.
     """
     rotated_q = (q * cos) + (rotate_half(q) * sin)
     rotated_k = (k * cos) + (rotate_half(k) * sin)
     return rotated_q, rotated_k
         super().__init__()
         base = base * scaling_factor
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
         self._cached_len = 0
+    def _build_cache(self, seq_len, device, dtype):
         t = torch.arange(seq_len, device=device).float()
         freqs = torch.outer(t, self.inv_freq.to(device))
         emb = torch.cat([freqs, freqs], dim=-1)
+        self._cos = emb.cos().to(dtype)[None, None, :, :]
+        self._sin = emb.sin().to(dtype)[None, None, :, :]
         self._cached_len = seq_len
+        self._cached_device = device
+    def forward(self, seq_len, device, dtype):
+        if seq_len > self._cached_len or not hasattr(self, '_cached_device') or device != self._cached_device:
+            self._build_cache(seq_len, device, dtype)
+        return self._cos[:, :, :seq_len, :], self._sin[:, :, :seq_len, :]
+# ── Attention ─────────────────────────────────────────────────────────────────
 class Grok2Attention(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
     def forward(self, hidden_states, attention_mask=None, **kwargs):
         B, T, _ = hidden_states.shape
         device = hidden_states.device
+        dtype = hidden_states.dtype
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rotary_emb(T, device, dtype)
+        cos = cos[:, :, :T, :self.head_dim]
+        sin = sin[:, :, :T, :self.head_dim]
         q, k = apply_rotary_emb(q, k, cos, sin)
+        # GQA expand
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
         attn = torch.matmul(q, k.transpose(-2, -1)) / scale
         if self.attn_softcap > 0:
+            attn = torch.tanh(attn / self.attn_softcap) * self.attn_softcap
         causal = torch.triu(
             torch.full((T, T), float("-inf"), device=device, dtype=dtype),
         attn = attn + causal.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
+            attn = attn + attention_mask.to(device=device, dtype=dtype)
         attn = F.softmax(attn, dim=-1).to(dtype)
         out = torch.matmul(attn, v)
         return self.o_proj(out)
+# ── MoE Expert ────────────────────────────────────────────────────────────────
 class Grok2Expert(nn.Module):
     def __init__(self, hidden_size, moe_intermediate_size):
         super().__init__()
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
+# ── Sparse MoE ────────────────────────────────────────────────────────────────
 class Grok2SparseMoE(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.top_k = config.num_experts_per_tok
         self.router_softcap = config.router_logit_softcapping
         self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
         self.experts = nn.ModuleList([
             Grok2Expert(config.hidden_size, config.moe_intermediate_size)
     def forward(self, x):
         B, T, H = x.shape
+        device = x.device
+        dtype = x.dtype
         x_flat = x.view(-1, H)
         router_logits = self.gate(x_flat)
         if self.router_softcap > 0:
             router_logits = torch.tanh(router_logits / self.router_softcap) * self.router_softcap
+        router_weights = F.softmax(router_logits, dim=-1)
         top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
         top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)
                 mask = (expert_ids == e)
                 if not mask.any():
                     continue
+                # Move tokens to expert's device, compute, move result back
                 expert_device = next(self.experts[e].parameters()).device
+                x_masked = x_flat[mask].to(device=expert_device, dtype=dtype)
+                expert_out = self.experts[e](x_masked).to(device=device, dtype=dtype)
+                out[mask] += weights[mask] * expert_out
         return out.view(B, T, H)
+# ── Dense MLP ─────────────────────────────────────────────────────────────────
 class Grok2MLP(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+# ── Decoder Layer ─────────────────────────────────────────────────────────────
 class Grok2DecoderLayer(nn.Module):
     def __init__(self, config: Grok2Config, layer_idx: int):
         super().__init__()
         self.post_moe_norm    = Grok2RMSNorm(config.hidden_size, config.rms_norm_eps)
     def forward(self, hidden_states, attention_mask=None, **kwargs):
+        device = hidden_states.device
+        dtype  = hidden_states.dtype
+        # Attention block
         residual = hidden_states
         hidden_states = self.pre_attn_norm(hidden_states)
         hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask)
+        hidden_states = self.post_attn_norm(hidden_states.to(device=device, dtype=dtype))
+        hidden_states = residual + hidden_states.to(device=device, dtype=dtype)
+        # MoE + dense residual block
         residual = hidden_states
+        normed = self.pre_moe_norm(hidden_states)
+        moe_out = self.block_sparse_moe(normed)
+        mlp_out = self.mlp(normed)
+        combined = moe_out.to(device=device, dtype=dtype) + mlp_out.to(device=device, dtype=dtype)
+        hidden_states = self.post_moe_norm(combined)
+        hidden_states = residual + hidden_states.to(device=device, dtype=dtype)
         return hidden_states
+# ── Model ─────────────────────────────────────────────────────────────────────
 class Grok2Model(nn.Module):
     def __init__(self, config: Grok2Config):
         super().__init__()
         return self.norm(hidden_states)
+# ── CausalLM ──────────────────────────────────────────────────────────────────
 class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = Grok2Config
     base_model_prefix = "model"
         **kwargs,
     ):
         hidden_states = self.model(input_ids, attention_mask=attention_mask)
+        # Move to lm_head device
+        hidden_states = hidden_states.to(self.lm_head.weight.device)
         logits = self.lm_head(hidden_states) * self.output_multiplier_scale
         if self.final_logit_softcapping > 0:
         return {"input_ids": input_ids}
+# ── Register ──────────────────────────────────────────────────────────────────
 AutoConfig.register("grok2", Grok2Config)
+AutoModelForCausalLM.register(Grok2Config, Grok1ForCausalLM)