lhallee committed on
Commit
94df558
·
verified ·
1 Parent(s): 0050733

Upload modeling_esm_plusplus.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_esm_plusplus.py +47 -36
modeling_esm_plusplus.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import torch
2
  import torch._inductor.config as inductor_config
3
  import torch._dynamo as dynamo
@@ -429,7 +431,8 @@ Contains: AttentionBackend enum, backend resolution, mask creation,
429
  flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
430
  """
431
  from enum import Enum
432
- from typing import Optional
 
433
 
434
  import torch
435
  import torch.nn as nn
@@ -447,7 +450,12 @@ _compiled_flex_attention = None
447
 
448
 
449
  def _get_flex_attention_fn():
450
- """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set."""
 
 
 
 
 
451
  global _compiled_flex_attention
452
  if flex_attention is None:
453
  return None
@@ -455,12 +463,15 @@ def _get_flex_attention_fn():
455
  if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
456
  return flex_attention
457
  if _compiled_flex_attention is None:
458
- _compiled_flex_attention = torch.compile(flex_attention)
 
 
 
459
  return _compiled_flex_attention
460
 
461
 
462
  ### Kernels Flash Attention Detection
463
- def _infer_kernels_flash_variant(kernel) -> str | None:
464
  if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
465
  return "flash_attn2"
466
  if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
@@ -576,7 +587,7 @@ class IndexFirstAxis(torch.autograd.Function):
576
  ).reshape(-1, *other_shape)
577
 
578
  @staticmethod
579
- def backward(ctx, grad_output) -> tuple[torch.Tensor, None]:
580
  (indices,) = ctx.saved_tensors
581
  assert grad_output.ndim >= 2
582
  other_shape = grad_output.shape[1:]
@@ -599,7 +610,7 @@ class IndexPutFirstAxis(torch.autograd.Function):
599
  return output
600
 
601
  @staticmethod
602
- def backward(ctx, grad_output) -> tuple[torch.Tensor, None, None]:
603
  (indices,) = ctx.saved_tensors
604
  return grad_output[indices], None, None
605
 
@@ -618,7 +629,7 @@ def _unpad_input(
618
  key_layer: torch.Tensor,
619
  value_layer: torch.Tensor,
620
  attention_mask_2d: torch.Tensor,
621
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, tuple[torch.Tensor, torch.Tensor], tuple[int, int]]:
622
  batch_size, seq_len, num_heads, head_dim = query_layer.shape
623
  seqlens = attention_mask_2d.sum(dim=1).int()
624
  cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
@@ -634,7 +645,7 @@ def kernels_flash_attention_func(
634
  query_states: torch.Tensor,
635
  key_states: torch.Tensor,
636
  value_states: torch.Tensor,
637
- attention_mask_2d: torch.Tensor | None = None,
638
  causal: bool = False,
639
  ) -> torch.Tensor:
640
  assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
@@ -707,7 +718,7 @@ def get_attention_mask(
707
  seq_len: int,
708
  device: torch.device,
709
  attention_mask: Optional[torch.Tensor] = None,
710
- ) -> tuple[torch.Tensor | None, torch.Tensor | None, "BlockMask | None"]:
711
  """Build padding masks once for all encoder layers.
712
 
713
  Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
@@ -783,7 +794,7 @@ class ESMplusplusConfig(PretrainedConfig):
783
  num_attention_heads: int = 15,
784
  num_hidden_layers: int = 30,
785
  num_labels: int = 2,
786
- problem_type: str | None = None,
787
  dropout: float = 0.0,
788
  initializer_range: float = 0.02,
789
  attn_backend: str = "sdpa",
@@ -1057,12 +1068,12 @@ class MultiHeadAttention(nn.Module):
1057
  def forward(
1058
  self,
1059
  x: torch.Tensor,
1060
- attention_mask_2d: torch.Tensor | None = None,
1061
- attention_mask_4d: torch.Tensor | None = None,
1062
- flex_block_mask: "BlockMask | None" = None,
1063
  output_attentions: bool = False,
1064
  output_s_max: bool = False,
1065
- ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
1066
  qkv_BLD3 = self.layernorm_qkv(x)
1067
  query_BLD, key_BLD, value_BLD = torch.chunk(qkv_BLD3, 3, dim=-1)
1068
  query_BLD, key_BLD = (
@@ -1089,12 +1100,12 @@ class MultiHeadAttention(nn.Module):
1089
  query_BHLD: torch.Tensor,
1090
  key_BHLD: torch.Tensor,
1091
  value_BHLD: torch.Tensor,
1092
- attention_mask_2d: torch.Tensor | None = None,
1093
- attention_mask_4d: torch.Tensor | None = None,
1094
- flex_block_mask: "BlockMask | None" = None,
1095
  output_attentions: bool = False,
1096
  output_s_max: bool = False,
1097
- ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
1098
  if output_attentions:
1099
  return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d, output_s_max)
1100
 
@@ -1111,7 +1122,7 @@ class MultiHeadAttention(nn.Module):
1111
  return attn_output, attn_weights, s_max
1112
 
1113
  @torch.no_grad()
1114
- def _compute_s_max(self, query_BHLD: torch.Tensor, key_BHLD: torch.Tensor) -> list[torch.Tensor]:
1115
  q_norm = torch.linalg.vector_norm(query_BHLD, dim=-1)
1116
  k_norm = torch.linalg.vector_norm(key_BHLD, dim=-1)
1117
  s_max_bound = (q_norm.max(dim=-1).values * k_norm.max(dim=-1).values).max(dim=0).values * self.scale
@@ -1122,9 +1133,9 @@ class MultiHeadAttention(nn.Module):
1122
  query_BHLD: torch.Tensor,
1123
  key_BHLD: torch.Tensor,
1124
  value_BHLD: torch.Tensor,
1125
- attention_mask_4d: torch.Tensor | None = None,
1126
  output_s_max: bool = False,
1127
- ) -> tuple[torch.Tensor, torch.Tensor, list[torch.Tensor] | None]:
1128
  attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-2, -1)) * self.scale
1129
  if attention_mask_4d is not None:
1130
  attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
@@ -1139,8 +1150,8 @@ class MultiHeadAttention(nn.Module):
1139
  query_BHLD: torch.Tensor,
1140
  key_BHLD: torch.Tensor,
1141
  value_BHLD: torch.Tensor,
1142
- attention_mask_2d: torch.Tensor | None = None,
1143
- ) -> tuple[torch.Tensor, None]:
1144
  query_BLHD = query_BHLD.transpose(1, 2).contiguous()
1145
  key_BLHD = key_BHLD.transpose(1, 2).contiguous()
1146
  value_BLHD = value_BHLD.transpose(1, 2).contiguous()
@@ -1155,8 +1166,8 @@ class MultiHeadAttention(nn.Module):
1155
  query_BHLD: torch.Tensor,
1156
  key_BHLD: torch.Tensor,
1157
  value_BHLD: torch.Tensor,
1158
- flex_block_mask: "BlockMask | None" = None,
1159
- ) -> tuple[torch.Tensor, None]:
1160
  assert flex_attention is not None, "Flex attention is not available in this environment."
1161
  fn = _get_flex_attention_fn()
1162
  context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=self.scale)
@@ -1167,8 +1178,8 @@ class MultiHeadAttention(nn.Module):
1167
  query_BHLD: torch.Tensor,
1168
  key_BHLD: torch.Tensor,
1169
  value_BHLD: torch.Tensor,
1170
- attention_mask_4d: torch.Tensor | None = None,
1171
- ) -> tuple[torch.Tensor, None]:
1172
  context_BHLD = F.scaled_dot_product_attention(
1173
  query_BHLD, key_BHLD, value_BHLD, attn_mask=attention_mask_4d, scale=self.scale,
1174
  )
@@ -1214,12 +1225,12 @@ class UnifiedTransformerBlock(nn.Module):
1214
  def forward(
1215
  self,
1216
  x: torch.Tensor,
1217
- attention_mask_2d: torch.Tensor | None = None,
1218
- attention_mask_4d: torch.Tensor | None = None,
1219
- flex_block_mask: "BlockMask | None" = None,
1220
  output_attentions: bool = False,
1221
  output_s_max: bool = False,
1222
- ) -> tuple[torch.Tensor, torch.Tensor | None, list[torch.Tensor] | None]:
1223
  attn_output, attn_weights, s_max = self.attn(
1224
  x,
1225
  attention_mask_2d=attention_mask_2d,
@@ -1240,7 +1251,7 @@ class TransformerOutput(ModelOutput):
1240
  last_hidden_state: Optional[torch.Tensor] = None
1241
  hidden_states: Optional[Tuple[torch.Tensor]] = None
1242
  attentions: Optional[Tuple[torch.Tensor]] = None
1243
- s_max: Optional[Tuple[list[torch.Tensor], ...]] = None
1244
 
1245
 
1246
  @dataclass
@@ -1251,7 +1262,7 @@ class ESMplusplusOutput(ModelOutput):
1251
  last_hidden_state: Optional[torch.Tensor] = None
1252
  hidden_states: Optional[Tuple[torch.Tensor]] = None
1253
  attentions: Optional[Tuple[torch.Tensor]] = None
1254
- s_max: Optional[Tuple[list[torch.Tensor], ...]] = None
1255
 
1256
 
1257
  ### Transformer Stack
@@ -1772,7 +1783,7 @@ def get_esmc_checkpoint_path(model: str) -> Path:
1772
  def _load_esmc_checkpoint_model(
1773
  config: ESMplusplusConfig,
1774
  model: str,
1775
- device: torch.device | str = "cpu",
1776
  ) -> ESMplusplusForMaskedLM:
1777
  key = _resolve_esmc_checkpoint_key(model)
1778
  spec = _ESMC_CHECKPOINT_SPECS[key]
@@ -1795,7 +1806,7 @@ def _load_esmc_checkpoint_model(
1795
  return model_obj
1796
 
1797
 
1798
- def ESMplusplus_300M(device: torch.device | str = "cpu"):
1799
  config = ESMplusplusConfig(
1800
  hidden_size=960,
1801
  num_attention_heads=15,
@@ -1804,7 +1815,7 @@ def ESMplusplus_300M(device: torch.device | str = "cpu"):
1804
  return _load_esmc_checkpoint_model(config=config, model="esmc-300", device=device)
1805
 
1806
 
1807
- def ESMplusplus_600M(device: torch.device | str = "cpu"):
1808
  config = ESMplusplusConfig(
1809
  hidden_size=1152,
1810
  num_attention_heads=18,
 
1
+ from __future__ import annotations
2
+
3
  import torch
4
  import torch._inductor.config as inductor_config
5
  import torch._dynamo as dynamo
 
431
  flex attention helpers, flash kernel detection/dispatch, and pad/unpad utilities.
432
  """
433
  from enum import Enum
434
+ from functools import partial
435
+ from typing import Dict, List, Optional, Tuple
436
 
437
  import torch
438
  import torch.nn as nn
 
450
 
451
 
452
  def _get_flex_attention_fn():
453
+ """Return flex_attention callable: compiled (fused kernel) by default, or eager when debug flag is set.
454
+
455
+ Uses kernel_options={"BACKEND": "FLASH"} to prefer Flash Attention 4 (FA4)
456
+ on Hopper/Blackwell GPUs (PyTorch 2.11+). Automatically falls back to Triton
457
+ on older hardware.
458
+ """
459
  global _compiled_flex_attention
460
  if flex_attention is None:
461
  return None
 
463
  if getattr(flex_mod, "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG", False):
464
  return flex_attention
465
  if _compiled_flex_attention is None:
466
+ _compiled_flex_attention = torch.compile(
467
+ partial(flex_attention, kernel_options={"BACKEND": "FLASH"}),
468
+ dynamic=False,
469
+ )
470
  return _compiled_flex_attention
471
 
472
 
473
  ### Kernels Flash Attention Detection
474
+ def _infer_kernels_flash_variant(kernel) -> Optional[str]:
475
  if hasattr(kernel, "fwd") and hasattr(kernel, "varlen_fwd"):
476
  return "flash_attn2"
477
  if hasattr(kernel, "flash_attn_func") and hasattr(kernel, "flash_attn_varlen_func"):
 
587
  ).reshape(-1, *other_shape)
588
 
589
  @staticmethod
590
+ def backward(ctx, grad_output) -> Tuple[torch.Tensor, None]:
591
  (indices,) = ctx.saved_tensors
592
  assert grad_output.ndim >= 2
593
  other_shape = grad_output.shape[1:]
 
610
  return output
611
 
612
  @staticmethod
613
+ def backward(ctx, grad_output) -> Tuple[torch.Tensor, None, None]:
614
  (indices,) = ctx.saved_tensors
615
  return grad_output[indices], None, None
616
 
 
629
  key_layer: torch.Tensor,
630
  value_layer: torch.Tensor,
631
  attention_mask_2d: torch.Tensor,
632
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
633
  batch_size, seq_len, num_heads, head_dim = query_layer.shape
634
  seqlens = attention_mask_2d.sum(dim=1).int()
635
  cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))
 
645
  query_states: torch.Tensor,
646
  key_states: torch.Tensor,
647
  value_states: torch.Tensor,
648
+ attention_mask_2d: Optional[torch.Tensor] = None,
649
  causal: bool = False,
650
  ) -> torch.Tensor:
651
  assert FLASH_KERNEL is not None, "Kernel Flash Attention is not available in this environment."
 
718
  seq_len: int,
719
  device: torch.device,
720
  attention_mask: Optional[torch.Tensor] = None,
721
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[BlockMask]]:
722
  """Build padding masks once for all encoder layers.
723
 
724
  Returns (attention_mask_2d, attention_mask_4d, flex_block_mask).
 
794
  num_attention_heads: int = 15,
795
  num_hidden_layers: int = 30,
796
  num_labels: int = 2,
797
+ problem_type: Optional[str] = None,
798
  dropout: float = 0.0,
799
  initializer_range: float = 0.02,
800
  attn_backend: str = "sdpa",
 
1068
  def forward(
1069
  self,
1070
  x: torch.Tensor,
1071
+ attention_mask_2d: Optional[torch.Tensor] = None,
1072
+ attention_mask_4d: Optional[torch.Tensor] = None,
1073
+ flex_block_mask: Optional[BlockMask] = None,
1074
  output_attentions: bool = False,
1075
  output_s_max: bool = False,
1076
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
1077
  qkv_BLD3 = self.layernorm_qkv(x)
1078
  query_BLD, key_BLD, value_BLD = torch.chunk(qkv_BLD3, 3, dim=-1)
1079
  query_BLD, key_BLD = (
 
1100
  query_BHLD: torch.Tensor,
1101
  key_BHLD: torch.Tensor,
1102
  value_BHLD: torch.Tensor,
1103
+ attention_mask_2d: Optional[torch.Tensor] = None,
1104
+ attention_mask_4d: Optional[torch.Tensor] = None,
1105
+ flex_block_mask: Optional[BlockMask] = None,
1106
  output_attentions: bool = False,
1107
  output_s_max: bool = False,
1108
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
1109
  if output_attentions:
1110
  return self._manual_attn(query_BHLD, key_BHLD, value_BHLD, attention_mask_4d, output_s_max)
1111
 
 
1122
  return attn_output, attn_weights, s_max
1123
 
1124
  @torch.no_grad()
1125
+ def _compute_s_max(self, query_BHLD: torch.Tensor, key_BHLD: torch.Tensor) -> List[torch.Tensor]:
1126
  q_norm = torch.linalg.vector_norm(query_BHLD, dim=-1)
1127
  k_norm = torch.linalg.vector_norm(key_BHLD, dim=-1)
1128
  s_max_bound = (q_norm.max(dim=-1).values * k_norm.max(dim=-1).values).max(dim=0).values * self.scale
 
1133
  query_BHLD: torch.Tensor,
1134
  key_BHLD: torch.Tensor,
1135
  value_BHLD: torch.Tensor,
1136
+ attention_mask_4d: Optional[torch.Tensor] = None,
1137
  output_s_max: bool = False,
1138
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[List[torch.Tensor]]]:
1139
  attn_weights = torch.matmul(query_BHLD, key_BHLD.transpose(-2, -1)) * self.scale
1140
  if attention_mask_4d is not None:
1141
  attn_weights = attn_weights.masked_fill(attention_mask_4d.logical_not(), float("-inf"))
 
1150
  query_BHLD: torch.Tensor,
1151
  key_BHLD: torch.Tensor,
1152
  value_BHLD: torch.Tensor,
1153
+ attention_mask_2d: Optional[torch.Tensor] = None,
1154
+ ) -> Tuple[torch.Tensor, None]:
1155
  query_BLHD = query_BHLD.transpose(1, 2).contiguous()
1156
  key_BLHD = key_BHLD.transpose(1, 2).contiguous()
1157
  value_BLHD = value_BHLD.transpose(1, 2).contiguous()
 
1166
  query_BHLD: torch.Tensor,
1167
  key_BHLD: torch.Tensor,
1168
  value_BHLD: torch.Tensor,
1169
+ flex_block_mask: Optional[BlockMask] = None,
1170
+ ) -> Tuple[torch.Tensor, None]:
1171
  assert flex_attention is not None, "Flex attention is not available in this environment."
1172
  fn = _get_flex_attention_fn()
1173
  context_BHLD = fn(query_BHLD, key_BHLD, value_BHLD, block_mask=flex_block_mask, scale=self.scale)
 
1178
  query_BHLD: torch.Tensor,
1179
  key_BHLD: torch.Tensor,
1180
  value_BHLD: torch.Tensor,
1181
+ attention_mask_4d: Optional[torch.Tensor] = None,
1182
+ ) -> Tuple[torch.Tensor, None]:
1183
  context_BHLD = F.scaled_dot_product_attention(
1184
  query_BHLD, key_BHLD, value_BHLD, attn_mask=attention_mask_4d, scale=self.scale,
1185
  )
 
1225
  def forward(
1226
  self,
1227
  x: torch.Tensor,
1228
+ attention_mask_2d: Optional[torch.Tensor] = None,
1229
+ attention_mask_4d: Optional[torch.Tensor] = None,
1230
+ flex_block_mask: Optional[BlockMask] = None,
1231
  output_attentions: bool = False,
1232
  output_s_max: bool = False,
1233
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
1234
  attn_output, attn_weights, s_max = self.attn(
1235
  x,
1236
  attention_mask_2d=attention_mask_2d,
 
1251
  last_hidden_state: Optional[torch.Tensor] = None
1252
  hidden_states: Optional[Tuple[torch.Tensor]] = None
1253
  attentions: Optional[Tuple[torch.Tensor]] = None
1254
+ s_max: Optional[Tuple[List[torch.Tensor], ...]] = None
1255
 
1256
 
1257
  @dataclass
 
1262
  last_hidden_state: Optional[torch.Tensor] = None
1263
  hidden_states: Optional[Tuple[torch.Tensor]] = None
1264
  attentions: Optional[Tuple[torch.Tensor]] = None
1265
+ s_max: Optional[Tuple[List[torch.Tensor], ...]] = None
1266
 
1267
 
1268
  ### Transformer Stack
 
1783
  def _load_esmc_checkpoint_model(
1784
  config: ESMplusplusConfig,
1785
  model: str,
1786
+ device: Union[torch.device, str] = "cpu",
1787
  ) -> ESMplusplusForMaskedLM:
1788
  key = _resolve_esmc_checkpoint_key(model)
1789
  spec = _ESMC_CHECKPOINT_SPECS[key]
 
1806
  return model_obj
1807
 
1808
 
1809
+ def ESMplusplus_300M(device: Union[torch.device, str] = "cpu"):
1810
  config = ESMplusplusConfig(
1811
  hidden_size=960,
1812
  num_attention_heads=15,
 
1815
  return _load_esmc_checkpoint_model(config=config, model="esmc-300", device=device)
1816
 
1817
 
1818
+ def ESMplusplus_600M(device: Union[torch.device, str] = "cpu"):
1819
  config = ESMplusplusConfig(
1820
  hidden_size=1152,
1821
  num_attention_heads=18,