duzx16 committed
Commit 835c717 · 1 Parent(s): dba7772

Add eager and sdpa attention implementations

- config.json +1 -0
- modeling_chatglm.py +90 -81
config.json CHANGED
@@ -17,6 +17,7 @@
   "apply_residual_connection_post_layernorm": false,
   "attention_dropout": 0.0,
   "attention_softmax_in_fp32": true,
+  "attn_implementation": "sdpa",
   "bias_dropout_fusion": true,
   "ffn_hidden_size": 13696,
   "fp32_residual_connection": false,
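The new "attn_implementation" key is what the modeling change below dispatches on: SelfAttention reads config._attn_implementation and instantiates the matching entry of the CORE_ATTENTION_CLASSES mapping ("eager" or "sdpa"). A minimal usage sketch, assuming a transformers release recent enough to accept the attn_implementation keyword and a checkpoint loaded with trust_remote_code=True; the model id only mirrors _CHECKPOINT_FOR_DOC and is illustrative, and editing config.json directly remains the fallback if the loader does not forward the kwarg to a remote-code model:

from transformers import AutoModel

model_id = "THUDM/ChatGLM"  # illustrative; any checkpoint shipping this modeling_chatglm.py

# Pick the attention code path at load time; with no override, the value
# shipped in config.json ("sdpa" after this commit) is used.
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True,
    attn_implementation="eager",  # or "sdpa"
)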
modeling_chatglm.py CHANGED
@@ -40,6 +40,7 @@ logger = logging.get_logger(__name__)
 _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
 _CONFIG_FOR_DOC = "ChatGLMConfig"

+
 def default_init(cls, *args, **kwargs):
     return cls(*args, **kwargs)

@@ -183,93 +184,99 @@ class CoreAttention(torch.nn.Module):
         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

     def forward(self, query_layer, key_layer, value_layer, attention_mask):
-        pytorch_major_version = int(torch.__version__.split('.')[0])
-        if pytorch_major_version >= 2:
-            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 is_causal=True)
-            else:
-                if attention_mask is not None:
-                    attention_mask = ~attention_mask
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 attention_mask)
-            context_layer = context_layer.transpose(1, 2).contiguous()
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.reshape(*new_context_layer_shape)
-        else:
-            # Raw attention scores
-
-            # [b, np, sq, sk]
-            output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
-
-            # [b, np, sq, hn] -> [b * np, sq, hn]
-            query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
-            # [b, np, sk, hn] -> [b * np, sk, hn]
-            key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
-
-            # preallocting input tensor: [b * np, sq, sk]
-            matmul_input_buffer = torch.empty(
-                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
-                device=query_layer.device
-            )
-
-            # Raw attention scores. [b * np, sq, sk]
-            matmul_result = torch.baddbmm(
-                matmul_input_buffer,
-                query_layer,  # [b * np, sq, hn]
-                key_layer.transpose(1, 2),  # [b * np, hn, sk]
-                beta=0.0,
-                alpha=(1.0 / self.norm_factor),
-            )
-
-            # change view to [b, np, sq, sk]
-            attention_scores = matmul_result.view(*output_size)
-
-            # ===========================
-            # Attention probs and dropout
-            # ===========================
-
-            # attention scores and attention mask [b, np, sq, sk]
-            if self.attention_softmax_in_fp32:
-                attention_scores = attention_scores.float()
-            if self.coeff is not None:
-                attention_scores = attention_scores * self.coeff
-            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
-                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
-                                            device=attention_scores.device, dtype=torch.bool)
-                attention_mask.tril_()
-                attention_mask = ~attention_mask
-            if attention_mask is not None:
-                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
-            attention_probs = F.softmax(attention_scores, dim=-1)
-            attention_probs = attention_probs.type_as(value_layer)
-
-            # This is actually dropping out entire tokens to attend to, which might
-            # seem a bit unusual, but is taken from the original Transformer paper.
-            attention_probs = self.attention_dropout(attention_probs)
-
-            # query layer shape: [b * np, sq, hn]
-            # value layer shape: [b, np, sk, hn]
-            # attention shape: [b, np, sq, sk]
-            # context layer shape: [b, np, sq, hn]
-            output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
-            # change view [b * np, sk, hn]
-            value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
-            # change view [b * np, sq, sk]
-            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-            # matmul: [b * np, sq, hn]
-            context_layer = torch.bmm(attention_probs, value_layer)
-            # change view [b, np, sq, hn]
-            context_layer = context_layer.view(*output_size)
-            # [b, np, sq, hn] --> [b, sq, np, hn]
-            context_layer = context_layer.transpose(1, 2).contiguous()
-            # [b, sq, np, hn] --> [b, sq, hp]
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.reshape(*new_context_layer_shape)
-
-        return context_layer
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))
+
+        # [b, np, sq, hn] -> [b * np, sq, hn]
+        query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
+        # [b, np, sk, hn] -> [b * np, sk, hn]
+        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)
+
+        # preallocting input tensor: [b * np, sq, sk]
+        matmul_input_buffer = torch.empty(
+            output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
+            device=query_layer.device
+        )
+
+        # Raw attention scores. [b * np, sq, sk]
+        matmul_result = torch.baddbmm(
+            matmul_input_buffer,
+            query_layer,  # [b * np, sq, hn]
+            key_layer.transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0,
+            alpha=(1.0 / self.norm_factor),
+        )
+
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+
+        # attention scores and attention mask [b, np, sq, sk]
+        if self.attention_softmax_in_fp32:
+            attention_scores = attention_scores.float()
+        if self.coeff is not None:
+            attention_scores = attention_scores * self.coeff
+        if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+            attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
+                                        device=attention_scores.device, dtype=torch.bool)
+            attention_mask.tril_()
+            attention_mask = ~attention_mask
+        if attention_mask is not None:
+            attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+        attention_probs = F.softmax(attention_scores, dim=-1)
+        attention_probs = attention_probs.type_as(value_layer)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.attention_dropout(attention_probs)
+
+        # query layer shape: [b * np, sq, hn]
+        # value layer shape: [b, np, sk, hn]
+        # attention shape: [b, np, sq, sk]
+        # context layer shape: [b, np, sq, hn]
+        output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
+        # change view [b * np, sk, hn]
+        value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
+        # change view [b * np, sq, sk]
+        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer)
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(*output_size)
+        # [b, np, sq, hn] --> [b, sq, np, hn]
+        context_layer = context_layer.transpose(1, 2).contiguous()
+        # [b, sq, np, hn] --> [b, sq, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(*new_context_layer_shape)
+
+        return context_layer
+
+
+class SdpaAttention(CoreAttention):
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+            context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                             is_causal=True)
+        else:
+            if attention_mask is not None:
+                attention_mask = ~attention_mask
+            context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                             attention_mask)
+        context_layer = context_layer.transpose(1, 2).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+        context_layer = context_layer.reshape(*new_context_layer_shape)
+        return context_layer


+CORE_ATTENTION_CLASSES = {
+    "eager": CoreAttention,
+    "sdpa": SdpaAttention,
+}
+
+
 class SelfAttention(torch.nn.Module):
     """Parallel self-attention layer abstract class.

@@ -299,7 +306,7 @@ class SelfAttention(torch.nn.Module):
                                          device=device, **_config_to_kwargs(config)
                                          )

-        self.core_attention = CoreAttention(config, self.layer_number)
+        self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)

         # Output.
         self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
@@ -378,7 +385,8 @@ class SelfAttention(torch.nn.Module):
             value_layer = torch.cat((cache_v, value_layer), dim=2)
         if use_cache:
             if kv_cache is None:
-                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1)
+                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
+                                     dim=1)
             else:
                 kv_cache = (key_layer, value_layer)
         else:
@@ -724,7 +732,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
         )

-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope,
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
+                                              original_impl=config.original_rope,
                                               device=device, dtype=config.torch_dtype)
         self.encoder = init_method(GLMTransformer, config, **init_kwargs)
         self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
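As a sanity check on the two implementations added here, the eager path (scaled matmul, causal mask, softmax, weighted sum, as in CoreAttention) and torch.nn.functional.scaled_dot_product_attention (as in SdpaAttention) should agree numerically in the mask-free causal case. A small self-contained sketch, independent of the model classes above; the shapes, the 1/sqrt(head_dim) scaling (CoreAttention's norm_factor), and the tolerance are illustrative:

import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
b, np_, sq, hn = 2, 4, 16, 32          # batch, heads, sequence length, head dim
q = torch.randn(b, np_, sq, hn)
k = torch.randn(b, np_, sq, hn)
v = torch.randn(b, np_, sq, hn)

# Eager path: raw scores scaled by 1/sqrt(hn), lower-triangular causal mask,
# softmax over the key dimension, then the weighted sum of values.
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(hn)
causal = torch.ones(sq, sq).tril().bool()
scores = scores.masked_fill(~causal, float("-inf"))
eager_out = torch.matmul(F.softmax(scores, dim=-1), v)

# SDPA path: the same computation in one fused call.
sdpa_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(torch.allclose(eager_out, sdpa_out, atol=1e-5))  # expected: True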