[hotfix] update gqa impl
modeling_grok1.py  +20 -0  CHANGED
@@ -74,6 +74,21 @@ def load_balancing_loss_func(
     ) * (num_experts**2)


+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
 class RMSNorm(nn.Module):
     def __init__(
         self,

@@ -194,6 +209,7 @@ class MultiHeadAttention(nn.Module):
         if num_key_value_heads is None:
             num_key_value_heads = num_heads
         self.num_key_value_heads = num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.attn_output_multiplier = attn_output_multiplier
         self.max_attn_val = max_attn_val


@@ -259,6 +275,10 @@ class MultiHeadAttention(nn.Module):

         past_key_value = (key_states, value_states) if use_cache else None

+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)).to(
             torch.float
         )
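For reference, a minimal sketch (not part of the commit) of what the added GQA repeat does: key/value heads are expanded by num_key_value_groups so their shapes line up with the query heads before the attention matmul. The head counts and tensor sizes below are toy values, not Grok-1's actual configuration.

# Minimal sketch, assuming toy shapes; mirrors the repeat_kv helper added above.
import torch


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Same logic as the helper added in this commit.
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


num_heads, num_key_value_heads = 8, 2                     # illustrative values only
num_key_value_groups = num_heads // num_key_value_heads   # 4, as computed in __init__

query_states = torch.randn(1, num_heads, 16, 64)           # (batch, heads, seq, head_dim)
key_states = torch.randn(1, num_key_value_heads, 16, 64)   # (batch, kv_heads, seq, head_dim)

# Without the repeat, the matmul below would fail with a head-count mismatch (8 vs 2).
key_states = repeat_kv(key_states, num_key_value_groups)
print(key_states.shape)  # torch.Size([1, 8, 16, 64])

attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
print(attn_weights.shape)  # torch.Size([1, 8, 16, 16])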