| """ |
| modeling_grok2.py — Grok 2 for transformers, full multi-GPU support. |
| Pure bf16 throughout. Device-aware at every operation. |
| """ |
|
|
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin |
| from transformers.modeling_outputs import CausalLMOutputWithPast |
| from transformers import AutoConfig, AutoModelForCausalLM |
|
|
|
|
| |
class Grok2Config(PretrainedConfig):
    """Configuration object holding the Grok 2 hyper-parameters.

    Every constructor argument is stored as an attribute of the same name.
    ``head_dim`` is not an argument: it is derived as
    ``hidden_size // num_attention_heads``. The special-token ids,
    ``tie_word_embeddings`` and any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "grok2"

    def __init__(
        self,
        vocab_size=131072,
        hidden_size=8192,
        num_hidden_layers=64,
        num_attention_heads=64,
        num_key_value_heads=8,
        intermediate_size=32768,
        moe_intermediate_size=16384,
        num_local_experts=8,
        num_experts_per_tok=2,
        max_position_embeddings=131072,
        rope_theta=208533496.0,
        rms_norm_eps=1e-5,
        embedding_multiplier_scale=90.50966799187809,
        output_multiplier_scale=0.5,
        final_logit_softcapping=50.0,
        attn_logit_softcapping=30.0,
        router_logit_softcapping=30.0,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # Model-specific settings, listed in the order they are attached to
        # the config instance.
        model_settings = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "num_key_value_heads": num_key_value_heads,
            # Per-head width is derived rather than configured directly.
            "head_dim": hidden_size // num_attention_heads,
            "intermediate_size": intermediate_size,
            "moe_intermediate_size": moe_intermediate_size,
            "num_local_experts": num_local_experts,
            "num_experts_per_tok": num_experts_per_tok,
            "max_position_embeddings": max_position_embeddings,
            "rope_theta": rope_theta,
            "rms_norm_eps": rms_norm_eps,
            "embedding_multiplier_scale": embedding_multiplier_scale,
            "output_multiplier_scale": output_multiplier_scale,
            "final_logit_softcapping": final_logit_softcapping,
            "attn_logit_softcapping": attn_logit_softcapping,
            "router_logit_softcapping": router_logit_softcapping,
        }
        for setting_name, setting_value in model_settings.items():
            setattr(self, setting_name, setting_value)

        # Generic options are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
|
|
|
| |
class Grok2RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain.

    Args:
        hidden_size: Size of the last dimension; the gain ``weight`` has this
            many elements and is initialised to ones.
        eps: Constant added to the mean square before the reciprocal square
            root.
    """

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` and scale it by ``weight``.

        The result has the dtype and device of ``x``. For float16/bfloat16
        inputs the mean square is accumulated in float32: squaring in half
        precision overflows for moderately large activations (anything above
        ~255 in float16), which would turn the variance into ``inf`` and the
        output into zeros. Other dtypes are processed as-is.
        """
        input_dtype = x.dtype
        if input_dtype in (torch.float16, torch.bfloat16):
            compute_dtype = torch.float32
        else:
            compute_dtype = input_dtype

        x_wide = x.to(compute_dtype)
        mean_square = x_wide.pow(2).mean(-1, keepdim=True)
        normed = (x_wide * torch.rsqrt(mean_square + self.eps)).to(input_dtype)

        # The gain may live on another device/dtype than the activations.
        return self.weight.to(x.device, input_dtype) * normed
|
|
|
|
| |
def rotate_half(x):
    """Return ``x`` with the two halves of its last dimension swapped.

    The last dimension is split at ``size // 2``; the trailing part is
    negated and placed first, followed by the leading part.
    """
    split_at = x.shape[-1] // 2
    leading = x[..., :split_at]
    trailing = x[..., split_at:]
    return torch.cat((-trailing, leading), dim=-1)
|
|
def apply_rotary_emb(q, k, cos, sin):
    """Apply the rotary position transform to a query/key pair.

    Each tensor ``t`` becomes ``t * cos + rotate_half(t) * sin``; ``cos`` and
    ``sin`` must broadcast against both ``q`` and ``k``.

    Returns:
        A ``(q_rotated, k_rotated)`` tuple.
    """
    q_rotated = (q * cos) + (rotate_half(q) * sin)
    k_rotated = (k * cos) + (rotate_half(k) * sin)
    return q_rotated, k_rotated
|
|
class Grok2RotaryEmbedding(nn.Module):
    """Lazily cached cos/sin tables for rotary position embeddings.

    Args:
        dim: Rotary dimension; ``dim // 2`` inverse frequencies are created.
        max_pos: Accepted for interface compatibility. The tables are grown
            on demand, so this value does not size anything.
        base: Frequency base before scaling.
        scaling_factor: The base is multiplied by this factor before the
            inverse frequencies are computed.
    """

    def __init__(self, dim, max_pos=131072, base=208533496.0, scaling_factor=16.0):
        super().__init__()
        base = base * scaling_factor
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Cache bookkeeping. Everything is initialised here so that forward()
        # never has to probe for attributes with hasattr().
        self._cached_len = 0
        self._cached_device = None
        self._cached_dtype = None
        self._cos = None
        self._sin = None

    def _build_cache(self, seq_len, device, dtype):
        """Recompute the tables for positions ``0 .. seq_len - 1``."""
        positions = torch.arange(seq_len, device=device).float()
        freqs = torch.outer(positions, self.inv_freq.to(device))
        # Frequencies are duplicated so the table spans the full rotary dim,
        # matching the half-split layout used by rotate_half().
        emb = torch.cat([freqs, freqs], dim=-1)
        self._cos = emb.cos().to(dtype)[None, None, :, :]
        self._sin = emb.sin().to(dtype)[None, None, :, :]
        self._cached_len = seq_len
        self._cached_device = device
        self._cached_dtype = dtype

    def forward(self, seq_len, device, dtype):
        """Return ``(cos, sin)``, each shaped ``(1, 1, seq_len, dim)``.

        The cache is rebuilt when a longer sequence, a different device or a
        different dtype is requested. Tracking the dtype matters: without it
        a call with a new dtype would silently receive tables in the old one.
        """
        # Normalise so that "cuda:0" and torch.device("cuda:0") compare equal.
        device = torch.device(device)
        cache_is_stale = (
            seq_len > self._cached_len
            or device != self._cached_device
            or dtype != self._cached_dtype
        )
        if cache_is_stale:
            self._build_cache(seq_len, device, dtype)
        return self._cos[:, :, :seq_len, :], self._sin[:, :, :seq_len, :]
|
|
|
|
| |
class Grok2Attention(nn.Module):
    """Causal grouped-query self-attention with rotary embeddings and tanh soft-capping.

    Key/value heads are repeated ``num_heads // num_kv_heads`` times to match
    the query heads. No key/value cache is kept: every call attends over the
    full sequence it is given.
    """

    def __init__(self, config: Grok2Config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.head_dim = config.head_dim
        self.num_kv_groups = self.num_heads // self.num_kv_heads
        self.attn_softcap = config.attn_logit_softcapping

        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=False)
        self.rotary_emb = Grok2RotaryEmbedding(config.head_dim, config.max_position_embeddings, config.rope_theta)

    @staticmethod
    def _to_additive_mask(attention_mask, device, dtype):
        """Convert a user-supplied mask into a term that can be added to the logits.

        * 2-D masks are treated as ``(batch, key_len)`` keep-masks (non-zero
          or ``True`` means "attend"), i.e. the layout tokenizers produce.
        * Boolean masks of any other rank are treated as keep-masks that
          already broadcast against ``(batch, heads, query_len, key_len)``.
        * Any other mask is assumed to be additive already and is only moved
          to the right device and dtype.

        Masked positions receive the most negative finite value instead of
        ``-inf`` so that a row whose visible keys are all padding still
        yields a finite softmax rather than NaN.
        """
        mask = attention_mask.to(device=device)
        if mask.dim() == 2:
            mask = mask[:, None, None, :]
            keep = mask if mask.dtype == torch.bool else mask != 0
        elif mask.dtype == torch.bool:
            keep = mask
        else:
            return mask.to(dtype=dtype)

        additive = torch.zeros(keep.shape, device=device, dtype=dtype)
        return additive.masked_fill(~keep, torch.finfo(dtype).min)

    def forward(self, hidden_states, attention_mask=None, **kwargs):
        """Run self-attention over ``hidden_states`` of shape ``(batch, seq, hidden)``.

        Args:
            hidden_states: Input activations.
            attention_mask: Optional padding/attention mask; see
                ``_to_additive_mask`` for the accepted layouts. A causal mask
                is always applied in addition to it.
            **kwargs: Ignored.

        Returns:
            Tensor of shape ``(batch, seq, hidden)``.
        """
        B, T, _ = hidden_states.shape
        device = hidden_states.device
        dtype = hidden_states.dtype

        q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)

        # The rotary module already returns tables trimmed to T positions.
        cos, sin = self.rotary_emb(T, device, dtype)
        q, k = apply_rotary_emb(q, k, cos, sin)

        # Grouped-query attention: expand K/V to one head per query head.
        k = k.repeat_interleave(self.num_kv_groups, dim=1)
        v = v.repeat_interleave(self.num_kv_groups, dim=1)

        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Soft-capping is optional; None or a non-positive value disables it.
        if self.attn_softcap is not None and self.attn_softcap > 0:
            attn = torch.tanh(attn / self.attn_softcap) * self.attn_softcap

        # Strictly-upper-triangular -inf blocks attention to future positions.
        causal = torch.triu(
            torch.full((T, T), float("-inf"), device=device, dtype=dtype),
            diagonal=1,
        )
        attn = attn + causal.unsqueeze(0).unsqueeze(0)

        if attention_mask is not None:
            attn = attn + self._to_additive_mask(attention_mask, device, dtype)

        attn = F.softmax(attn, dim=-1).to(dtype)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, T, -1)
        return self.o_proj(out)
|
|
|
|
| |
class Grok2Expert(nn.Module):
    """Single gated feed-forward expert: ``w2(silu(w1(x)) * w3(x))``.

    The three projections may sit on different devices; activations are
    moved to wherever the next weight lives, and the result is returned on
    the device of ``w2``.
    """

    def __init__(self, hidden_size, moe_intermediate_size):
        super().__init__()
        self.w1 = nn.Linear(hidden_size, moe_intermediate_size, bias=False)
        self.w2 = nn.Linear(moe_intermediate_size, hidden_size, bias=False)
        self.w3 = nn.Linear(hidden_size, moe_intermediate_size, bias=False)

    def forward(self, x):
        """Apply the expert to ``x`` (last dimension ``hidden_size``)."""
        gate_device = self.w1.weight.device
        up_device = self.w3.weight.device
        down_device = self.w2.weight.device

        # Bring the input next to the gate projection first.
        x = x.to(gate_device)
        gated = F.silu(self.w1(x))
        lifted = self.w3(x.to(up_device))

        # Both branches meet on the device of the down projection.
        fused = gated.to(down_device) * lifted.to(down_device)
        return self.w2(fused)
|
|
|
|
| |
class Grok2SparseMoE(nn.Module):
    """Top-k routed mixture of ``Grok2Expert`` feed-forward blocks.

    A linear router scores every token against every expert. The scores are
    optionally tanh soft-capped, turned into probabilities with a softmax,
    and the ``top_k`` largest are kept and renormalised to sum to one. Each
    token's output is the weighted sum of its selected experts' outputs.
    """

    def __init__(self, config: Grok2Config):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok
        self.router_softcap = config.router_logit_softcapping

        self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
        self.experts = nn.ModuleList([
            Grok2Expert(config.hidden_size, config.moe_intermediate_size)
            for _ in range(config.num_local_experts)
        ])

    def forward(self, x):
        """Route ``x`` of shape ``(batch, seq, hidden)`` through the experts.

        Returns:
            Tensor with the same shape, dtype and device as ``x``.
        """
        B, T, H = x.shape
        device = x.device
        dtype = x.dtype
        # reshape() also accepts non-contiguous inputs, unlike view().
        x_flat = x.reshape(-1, H)

        router_logits = self.gate(x_flat)
        # Soft-capping is optional; None or a non-positive value disables it.
        if self.router_softcap is not None and self.router_softcap > 0:
            router_logits = torch.tanh(router_logits / self.router_softcap) * self.router_softcap

        router_weights = F.softmax(router_logits, dim=-1)
        top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
        top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)

        out = torch.zeros_like(x_flat)
        # Visit each expert once and gather every token that selected it in
        # any of its top-k slots. This halves the expert invocations (and
        # cross-device copies) compared with looping over slots first.
        for expert_id, expert in enumerate(self.experts):
            token_idx, slot_idx = torch.where(top_indices == expert_id)
            if token_idx.numel() == 0:
                continue

            # Experts may be placed on a different device than the router.
            expert_device = next(expert.parameters()).device
            expert_in = x_flat[token_idx].to(device=expert_device, dtype=dtype)
            expert_out = expert(expert_in).to(device=device, dtype=dtype)

            routing = top_weights[token_idx, slot_idx].unsqueeze(-1)
            out.index_add_(0, token_idx, (routing * expert_out).to(out.dtype))

        return out.view(B, T, H)
|
|
|
|
| |
class Grok2MLP(nn.Module):
    """Dense gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

    All three projections are bias-free and sized from
    ``config.hidden_size`` and ``config.intermediate_size``.
    """

    def __init__(self, config: Grok2Config):
        super().__init__()
        model_width = config.hidden_size
        inner_width = config.intermediate_size
        self.gate_proj = nn.Linear(model_width, inner_width, bias=False)
        self.up_proj = nn.Linear(model_width, inner_width, bias=False)
        self.down_proj = nn.Linear(inner_width, model_width, bias=False)

    def forward(self, x):
        """Apply the gated feed-forward transform to ``x``."""
        gated = F.silu(self.gate_proj(x))
        lifted = self.up_proj(x)
        return self.down_proj(gated * lifted)
|
|
|
|
| |
class Grok2DecoderLayer(nn.Module):
    """One decoder block: attention, then a combined sparse-MoE + dense-MLP stage.

    Each stage is normalised before and after the sub-layer, and the
    post-normalised result is added back to the stage input. The MoE and the
    dense MLP both read the same normalised activations and their outputs
    are summed before the post-normalisation.
    """

    def __init__(self, config: Grok2Config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        eps = config.rms_norm_eps
        width = config.hidden_size
        self.pre_attn_norm = Grok2RMSNorm(width, eps)
        self.self_attn = Grok2Attention(config)
        self.post_attn_norm = Grok2RMSNorm(width, eps)
        self.pre_moe_norm = Grok2RMSNorm(width, eps)
        self.block_sparse_moe = Grok2SparseMoE(config)
        self.mlp = Grok2MLP(config)
        self.post_moe_norm = Grok2RMSNorm(width, eps)

    def forward(self, hidden_states, attention_mask=None, **kwargs):
        """Apply the block to ``hidden_states``; extra ``kwargs`` are ignored.

        Sub-layer outputs are moved back to the device and dtype of the
        incoming ``hidden_states`` before they are combined.
        """
        # Placement of the residual stream for this block.
        home = {"device": hidden_states.device, "dtype": hidden_states.dtype}

        # --- attention stage ---
        attn_input = self.pre_attn_norm(hidden_states)
        attn_output = self.self_attn(attn_input, attention_mask=attention_mask)
        attn_output = self.post_attn_norm(attn_output.to(**home))
        hidden_states = hidden_states + attn_output.to(**home)

        # --- feed-forward stage: sparse experts plus dense MLP ---
        ffn_input = self.pre_moe_norm(hidden_states)
        sparse_branch = self.block_sparse_moe(ffn_input)
        dense_branch = self.mlp(ffn_input)
        merged = sparse_branch.to(**home) + dense_branch.to(**home)
        ffn_output = self.post_moe_norm(merged)

        return hidden_states + ffn_output.to(**home)
|
|
|
|
| |
class Grok2Model(nn.Module):
    """Decoder stack: scaled token embeddings, the decoder layers, a final norm."""

    def __init__(self, config: Grok2Config):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.embedding_multiplier_scale = config.embedding_multiplier_scale
        decoder_blocks = [
            Grok2DecoderLayer(config, index)
            for index in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_blocks)
        self.norm = Grok2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Embed ``input_ids`` and run them through every decoder layer.

        The embeddings are multiplied by ``embedding_multiplier_scale``
        before the first layer. ``attention_mask`` is handed unchanged to
        each layer; extra ``kwargs`` are ignored.

        Returns:
            The final-normalised hidden states.
        """
        state = self.embed_tokens(input_ids) * self.embedding_multiplier_scale
        for block in self.layers:
            state = block(state, attention_mask=attention_mask)
        return self.norm(state)
|
|
|
|
| |
class Grok1ForCausalLM(PreTrainedModel, GenerationMixin):
    """Grok 2 decoder with a language-modelling head.

    The head output is multiplied by ``output_multiplier_scale`` and then
    optionally tanh soft-capped. No key/value cache is implemented:
    ``past_key_values`` is accepted but unused and always returned as
    ``None``.
    """

    config_class = Grok2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = False

    def __init__(self, config: Grok2Config):
        super().__init__(config)
        self.model = Grok2Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.output_multiplier_scale = config.output_multiplier_scale
        self.final_logit_softcapping = config.final_logit_softcapping
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.model.embed_tokens

    def get_output_embeddings(self):
        """Return the language-modelling head."""
        return self.lm_head

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        **kwargs,
    ):
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        Args:
            input_ids: Token ids; required.
            attention_mask: Optional mask forwarded to the decoder.
            past_key_values: Accepted for API compatibility; unused.
            inputs_embeds: Accepted for API compatibility; not supported.
            labels: Optional target ids. They are shifted by one position
                against the logits; entries equal to ``-100`` are ignored.
            use_cache: Accepted for API compatibility; unused.
            **kwargs: Ignored.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss`` (or ``None``), ``logits``
            and ``past_key_values=None``.

        Raises:
            ValueError: If ``input_ids`` is ``None``.
        """
        if input_ids is None:
            # Fail early with a clear message instead of an opaque error
            # from inside the embedding lookup.
            raise ValueError(
                "Grok1ForCausalLM.forward requires input_ids; inputs_embeds is not supported."
            )

        hidden_states = self.model(input_ids, attention_mask=attention_mask)

        # The head may be placed on a different device than the last layer.
        hidden_states = hidden_states.to(self.lm_head.weight.device)
        logits = self.lm_head(hidden_states) * self.output_multiplier_scale

        # Soft-capping is optional; None or a non-positive value disables it.
        softcap = self.final_logit_softcapping
        if softcap is not None and softcap > 0:
            logits = torch.tanh(logits / softcap) * softcap

        loss = None
        if labels is not None:
            # Position t predicts token t + 1. The loss is evaluated in
            # float32 so that the log-softmax over the vocabulary does not
            # lose precision in half-precision models.
            shift_logits = logits[..., :-1, :].float().contiguous()
            # Labels normally arrive on the input device, which differs from
            # the logits device when the model is spread across GPUs.
            shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Build the forward inputs for one generation step.

        Only ``input_ids`` is passed on; every other keyword argument
        (including any attention mask) is dropped.
        """
        return {"input_ids": input_ids}
|
|
|
|
| |
# Make the architecture discoverable through the Auto* factories: the
# "grok2" model type resolves to Grok2Config, and Grok2Config resolves to the
# causal-LM class defined in this module.
AutoConfig.register(model_type="grok2", config=Grok2Config)
AutoModelForCausalLM.register(
    config_class=Grok2Config,
    model_class=Grok1ForCausalLM,
)
|
|