Synthyra
/

DPLM2-650M

@@ -1,3 +1,5 @@
 import torch
 import torch._inductor.config as inductor_config
 import torch._dynamo as dynamo
@@ -429,7 +431,8 @@ Contains: AttentionBackend enum, backend resolution, mask creation,
 flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
 """
 from enum import Enum
-from typing import Optional
 import torch
 import torch.nn as nn
@@ -447,7 +450,12 @@ _compiled_flex_attention = None
 def _get_flex_attention_fn():
-    """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set."""
     global _compiled_flex_attention
     if flex_attention is None:
         return None
@@ -455,12 +463,15 @@ def _get_flex_attention_fn():
     if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
         return flex_attention
     if _compiled_flex_attention is None:
-        _compiled_flex_attention = torch.compile(flex_attention)
     return _compiled_flex_attention
 ### Kernels Flash Attention Detection
-def _infer_kernels_flash_variant(kernel) -> str | None:
     if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
         return "flash_attn2"
     if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
@@ -576,7 +587,7 @@ class IndexFirstAxis(torch.autograd.Function):
         ).reshape(-1, *other_shape)
     @staticmethod
-    def backward(ctx, grad_output) -> tuple[torch.Tensor, None]:
         (indices,) = ctx.saved_tensors
         assert grad_output.ndim >= 2
         other_shape = grad_output.shape[1:]
@@ -599,7 +610,7 @@ class IndexPutFirstAxis(torch.autograd.Function):
         return output
     @staticmethod
-    def backward(ctx, grad_output) -> tuple[torch.Tensor, None, None]:
         (indices,) = ctx.saved_tensors
         return grad_output[indices], None, None
@@ -618,7 +629,7 @@ def _unpad_input(
     key_layer: torch.Tensor,
     value_layer: torch.Tensor,
     attention_mask_2d: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, tuple[torch.Tensor, torch.Tensor], tuple[int, int]]:
     batch_size, seq_len, num_heads, head_dim = query_layer.shape
     seqlens = attention_mask_2d.sum(dim=1).int()
     cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
@@ -634,7 +645,7 @@ def kernels_flash_attention_func(
     query_states: torch.Tensor,
     key_states: torch.Tensor,
     value_states: torch.Tensor,
-    attention_mask_2d: torch.Tensor | None = None,
     causal: bool = False,
 ) -> torch.Tensor:
     assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
@@ -707,7 +718,7 @@ def get_attention_mask(
     seq_len: int,
     device: torch.device,
     attention_mask: Optional[torch.Tensor] = None,
-) -> tuple[torch.Tensor | None, torch.Tensor | None, "BlockMask | None"]:
     """Build padding masks once for all encoder layers.
     Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
@@ -838,7 +849,7 @@ class DPLM2MaskedLMOutput(ModelOutput):
     last_hidden_state: Optional[torch.Tensor] = None
     hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
     attentions: Optional[Tuple[torch.Tensor, ...]] = None
-    s_max: Optional[Tuple[list[torch.Tensor], ...]] = None
 @dataclass
@@ -846,7 +857,7 @@ class DPLM2EncoderOutput(ModelOutput):
     last_hidden_state: Optional[torch.Tensor] = None
     hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
     attentions: Optional[Tuple[torch.Tensor, ...]] = None
-    s_max: Optional[Tuple[list[torch.Tensor], ...]] = None
 class DPLM2Config(EsmConfig):
@@ -986,13 +997,13 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask_2d: torch.Tensor | None = None,
-        attention_mask_4d: torch.Tensor | None = None,
-        flex_block_mask: "BlockMask | None" = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
         batch_size, seq_length = hidden_states.shape[:-1]
         hidden_shape = (batch_size, seq_length, -1, self.attention_head_size)
         query_BHLD = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
@@ -1019,12 +1030,12 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
-        attention_mask_2d: torch.Tensor | None = None,
-        attention_mask_4d: torch.Tensor | None = None,
-        flex_block_mask: "BlockMask | None" = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
         if output_attentions:
             return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d, output_s_max)
@@ -1041,7 +1052,7 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         return attn_output, attn_weights, s_max
     @torch.no_grad()
-    def _compute_s_max(self, query_BHLD: torch.Tensor, key_BHLD: torch.Tensor) -> list[torch.Tensor]:
         q_norm = torch.linalg.vector_norm(query_BHLD, dim=-1)
         k_norm = torch.linalg.vector_norm(key_BHLD, dim=-1)
         s_max_bound = (q_norm.max(dim=-1).values * k_norm.max(dim=-1).values).max(dim=0).values
@@ -1052,9 +1063,9 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
-        attention_mask_4d: torch.Tensor | None = None,
         output_s_max: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor] | None]:
         attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-1, -2))
         if attention_mask_4d is not None:
             attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
@@ -1071,8 +1082,8 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
-        attention_mask_2d: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, None]:
         query_BLHD = query_BHLD.transpose(1, 2).contiguous()
         key_BLHD = key_BHLD.transpose(1, 2).contiguous()
         value_BLHD = value_BHLD.transpose(1, 2).contiguous()
@@ -1087,8 +1098,8 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
-        flex_block_mask: "BlockMask | None" = None,
-    ) -> tuple[torch.Tensor, None]:
         assert flex_attention is not None, "Flex attention is not available in this environment."
         fn = _get_flex_attention_fn()
         context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
@@ -1099,8 +1110,8 @@ class ModifiedEsmSelfAttention(EsmSelfAttention):
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
-        attention_mask_4d: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, None]:
         context_BHLD = F.scaled_dot_product_attention(
             query_BHLD, key_BHLD, value_BHLD,
             attn_mask=attention_mask_4d,
@@ -1120,13 +1131,13 @@ class ModifiedEsmAttention(EsmAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask_2d: torch.Tensor | None = None,
-        attention_mask_4d: torch.Tensor | None = None,
-        flex_block_mask: "BlockMask | None" = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
         hidden_states_ln = self.LayerNorm(hidden_states)
         attn_output, attn_weights, s_max = self.self(
             hidden_states_ln,
@@ -1154,13 +1165,13 @@ class ModifiedEsmLayer(EsmLayer):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask_2d: torch.Tensor | None = None,
-        attention_mask_4d: torch.Tensor | None = None,
-        flex_block_mask: "BlockMask | None" = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
         attention_output, attn_weights, s_max = self.attention(
             hidden_states,
             attention_mask_2d=attention_mask_2d,

+from __future__ import annotations
 import torch
 import torch._inductor.config as inductor_config
 import torch._dynamo as dynamo
 flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
 """
 from enum import Enum
+from functools import partial
+from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn as nn
 def _get_flex_attention_fn():
+    """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set.
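+    Passes kernel_options={"BACKEND": "FLASH"} to request the Flash Attention 4 (FA4)
+    backend, which targets Hopper/Blackwell GPUs (PyTorch 2.11+).
+    NOTE(review): no Triton fallback is visible in this function. PyTorch's
+    FlexKernelOptions documentation describes an explicitly requested BACKEND as
+    raising an error when it cannot be used, so confirm the behaviour on older
+    hardware / older PyTorch before relying on an automatic fallback.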
+    """
     global _compiled_flex_attention
     if flex_attention is None:
         return None
     if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
         return flex_attention
     if _compiled_flex_attention is None:
+        _compiled_flex_attention = torch.compile(
+            partial(flex_attention, kernel_options={"BACKEND": "FLASH"}),
+            dynamic=False,
+        )
     return _compiled_flex_attention
 ### Kernels Flash Attention Detection
+def _infer_kernels_flash_variant(kernel) -> Optional[str]:
     if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
         return "flash_attn2"
     if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
         ).reshape(-1, *other_shape)
     @staticmethod
+    def backward(ctx, grad_output) -> Tuple[torch.Tensor, None]:
         (indices,) = ctx.saved_tensors
         assert grad_output.ndim >= 2
         other_shape = grad_output.shape[1:]
         return output
     @staticmethod
+    def backward(ctx, grad_output) -> Tuple[torch.Tensor, None, None]:
+        """Backward pass: gather the rows of ``grad_output`` selected by the saved indices.
+
+        Returns a 3-tuple whose first element is ``grad_output[indices]``. The two
+        trailing ``None`` entries presumably correspond to forward inputs that
+        receive no gradient (the forward signature is not visible here -- TODO confirm).
+        """
         (indices,) = ctx.saved_tensors
         return grad_output[indices], None, None
     key_layer: torch.Tensor,
     value_layer: torch.Tensor,
     attention_mask_2d: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
     batch_size, seq_len, num_heads, head_dim = query_layer.shape
     seqlens = attention_mask_2d.sum(dim=1).int()
     cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
     query_states: torch.Tensor,
     key_states: torch.Tensor,
     value_states: torch.Tensor,
+    attention_mask_2d: Optional[torch.Tensor] = None,
     causal: bool = False,
 ) -> torch.Tensor:
     assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
     seq_len: int,
     device: torch.device,
     attention_mask: Optional[torch.Tensor] = None,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[BlockMask]]:
     """Build padding masks once for all encoder layers.
     Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
     last_hidden_state: Optional[torch.Tensor] = None
     hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
     attentions: Optional[Tuple[torch.Tensor, ...]] = None
+    s_max: Optional[Tuple[List[torch.Tensor], ...]] = None
 @dataclass
     last_hidden_state: Optional[torch.Tensor] = None
     hidden_states: Optional[Tuple[torch.Tensor, ...]] = None
     attentions: Optional[Tuple[torch.Tensor, ...]] = None
+    s_max: Optional[Tuple[List[torch.Tensor], ...]] = None
 class DPLM2Config(EsmConfig):
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask_2d: Optional[torch.Tensor] = None,
+        attention_mask_4d: Optional[torch.Tensor] = None,
+        flex_block_mask: Optional[BlockMask] = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
         batch_size, seq_length = hidden_states.shape[:-1]
         hidden_shape = (batch_size, seq_length, -1, self.attention_head_size)
         query_BHLD = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
+        attention_mask_2d: Optional[torch.Tensor] = None,
+        attention_mask_4d: Optional[torch.Tensor] = None,
+        flex_block_mask: Optional[BlockMask] = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
         if output_attentions:
             return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d, output_s_max)
         return attn_output, attn_weights, s_max
     @torch.no_grad()
+    def _compute_s_max(self, query_BHLD: torch.Tensor, key_BHLD: torch.Tensor) -> List[torch.Tensor]:
         q_norm = torch.linalg.vector_norm(query_BHLD, dim=-1)
         k_norm = torch.linalg.vector_norm(key_BHLD, dim=-1)
         s_max_bound = (q_norm.max(dim=-1).values * k_norm.max(dim=-1).values).max(dim=0).values
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
+        attention_mask_4d: Optional[torch.Tensor] = None,
         output_s_max: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]]]:
         attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-1, -2))
         if attention_mask_4d is not None:
             attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
+        attention_mask_2d: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, None]:
         query_BLHD = query_BHLD.transpose(1, 2).contiguous()
         key_BLHD = key_BHLD.transpose(1, 2).contiguous()
         value_BLHD = value_BHLD.transpose(1, 2).contiguous()
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
+        flex_block_mask: Optional[BlockMask] = None,
+    ) -> Tuple[torch.Tensor, None]:
         assert flex_attention is not None, "Flex attention is not available in this environment."
         fn = _get_flex_attention_fn()
         context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=1.0)
         query_BHLD: torch.Tensor,
         key_BHLD: torch.Tensor,
         value_BHLD: torch.Tensor,
+        attention_mask_4d: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, None]:
         context_BHLD = F.scaled_dot_product_attention(
             query_BHLD, key_BHLD, value_BHLD,
             attn_mask=attention_mask_4d,
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask_2d: Optional[torch.Tensor] = None,
+        attention_mask_4d: Optional[torch.Tensor] = None,
+        flex_block_mask: Optional[BlockMask] = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
         hidden_states_ln = self.LayerNorm(hidden_states)
         attn_output, attn_weights, s_max = self.self(
             hidden_states_ln,
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask_2d: Optional[torch.Tensor] = None,
+        attention_mask_4d: Optional[torch.Tensor] = None,
+        flex_block_mask: Optional[BlockMask] = None,
         output_attentions: bool = False,
         output_s_max: bool = False,
         type_ids: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
         attention_output, attn_weights, s_max = self.attention(
             hidden_states,
             attention_mask_2d=attention_mask_2d,