Pavel Rykov committed on
Commit
a25e87a
·
1 Parent(s): 9d4a84b

SDPA added

Browse files
Files changed (2) hide show
  1. configuration_rugpt3xl.py +2 -0
  2. modeling_rugpt3xl.py +30 -9
configuration_rugpt3xl.py CHANGED
@@ -40,6 +40,7 @@ class RuGPT3XLConfig(PretrainedConfig):
40
  sparse_num_local_blocks=8,
41
  sparse_num_global_blocks=1,
42
  sparse_num_different_global_patterns=8,
 
43
  **kwargs,
44
  ):
45
  self.vocab_size = vocab_size
@@ -60,6 +61,7 @@ class RuGPT3XLConfig(PretrainedConfig):
60
  self.sparse_num_local_blocks = sparse_num_local_blocks
61
  self.sparse_num_global_blocks = sparse_num_global_blocks
62
  self.sparse_num_different_global_patterns = sparse_num_different_global_patterns
 
63
 
64
  super().__init__(
65
  bos_token_id=bos_token_id,
 
40
  sparse_num_local_blocks=8,
41
  sparse_num_global_blocks=1,
42
  sparse_num_different_global_patterns=8,
43
+ attn_implementation="sdpa",
44
  **kwargs,
45
  ):
46
  self.vocab_size = vocab_size
 
61
  self.sparse_num_local_blocks = sparse_num_local_blocks
62
  self.sparse_num_global_blocks = sparse_num_global_blocks
63
  self.sparse_num_different_global_patterns = sparse_num_different_global_patterns
64
+ self.attn_implementation = attn_implementation
65
 
66
  super().__init__(
67
  bos_token_id=bos_token_id,
modeling_rugpt3xl.py CHANGED
@@ -2,7 +2,8 @@
2
 
3
  GPT-3-style decoder-only transformer (1.3B) trained on Russian text.
4
  Architecture: absolute position embeddings, pre-norm layers, GELU activation,
5
- tied LM head.
 
6
  """
7
 
8
  import math
@@ -107,17 +108,37 @@ class RuGPT3XLAttention(nn.Module):
107
  if past_key_value is not None:
108
  key, value = past_key_value.update(key, value, self.layer_idx)
109
 
110
- attn_weights = torch.matmul(query, key.transpose(2, 3)) * self.scale
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- if attention_mask is not None:
113
- attn_weights = attn_weights + attention_mask
114
 
115
- attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
116
- query.dtype
117
- )
118
- attn_weights = self.attn_dropout(attn_weights)
 
 
 
119
 
120
- attn_output = torch.matmul(attn_weights, value)
121
  attn_output = attn_output.transpose(1, 2).contiguous()
122
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
123
 
 
2
 
3
  GPT-3-style decoder-only transformer (1.3B) trained on Russian text.
4
  Architecture: absolute position embeddings, pre-norm layers, GELU activation,
5
+ tied LM head. Attention: config.attn_implementation "sdpa" uses
6
+ scaled_dot_product_attention (Flash/Memory-efficient/Triton backends on CUDA).
7
  """
8
 
9
  import math
 
108
  if past_key_value is not None:
109
  key, value = past_key_value.update(key, value, self.layer_idx)
110
 
111
+ attn_impl = getattr(self.config, "attn_implementation", "sdpa")
112
+ use_sdpa = attn_impl == "sdpa" and not output_attentions
113
+
114
+ if use_sdpa:
115
+ dropout_p = self.attn_dropout.p if self.training else 0.0
116
+ sdpa_mask = attention_mask
117
+ if sdpa_mask is not None:
118
+ sdpa_mask = sdpa_mask.to(dtype=query.dtype)
119
+ attn_output = F.scaled_dot_product_attention(
120
+ query,
121
+ key,
122
+ value,
123
+ attn_mask=sdpa_mask,
124
+ dropout_p=dropout_p,
125
+ is_causal=False,
126
+ )
127
+ attn_weights = None
128
+ else:
129
+ attn_weights = torch.matmul(query, key.transpose(2, 3)) * self.scale
130
 
131
+ if attention_mask is not None:
132
+ attn_weights = attn_weights + attention_mask
133
 
134
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
135
+ query.dtype
136
+ )
137
+ attn_weights = self.attn_dropout(attn_weights)
138
+
139
+ attn_output = torch.matmul(attn_weights, value)
140
+ attn_weights = attn_weights if output_attentions else None
141
 
 
142
  attn_output = attn_output.transpose(1, 2).contiguous()
143
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
144