Update modeling_fastesm.py
modeling_fastesm.py  CHANGED  (+111 -7)
@@ -3,7 +3,7 @@ import torch.nn as nn
 from torch.nn import functional as F
 from typing import Optional, Tuple, Union
 from einops import rearrange
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel, PretrainedConfig
 from transformers.modeling_outputs import (
     MaskedLMOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -12,8 +12,6 @@ from transformers.modeling_outputs import (
     TokenClassifierOutput
 )
 from transformers.models.esm.modeling_esm import (
-    RotaryEmbedding,
-    EsmContactPredictionHead,
     EsmIntermediate,
     EsmOutput,
     EsmPooler,
@@ -22,7 +20,108 @@ from transformers.models.esm.modeling_esm import (
     EsmClassificationHead,
     create_position_ids_from_input_ids,
 )
-
+
+
+class FastEsmConfig(PretrainedConfig):
+    model_type = "fast_esm"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        mask_token_id=None,
+        pad_token_id=None,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=1026,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        position_embedding_type="absolute",
+        emb_layer_norm_before=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.emb_layer_norm_before = emb_layer_norm_before
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = super().to_dict()
+        return output
+
+
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(x, cos, sin):
+    cos = cos[:, :, : x.shape[-2], :]
+    sin = sin[:, :, : x.shape[-2], :]
+
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+class RotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embeddings based on those in
+    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
+    matrices which depend on their relative positions.
+    """
+
+    def __init__(self, dim: int):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        inv_freq = inv_freq
+        self.register_buffer("inv_freq", inv_freq)
+
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=2):
+        seq_len = x.shape[seq_dimension]
+
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
+            self._seq_len_cached = seq_len
+            t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
+            freqs = torch.outer(t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)
+
+        return (
+            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
+            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
+        )
 
 
 class EsmEmbeddings(nn.Module):
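Note on the hunk above: `RotaryEmbedding` is now defined locally instead of being imported from `transformers.models.esm.modeling_esm`, and `FastEsmConfig` registers the `fast_esm` model type. A minimal sketch of how the new pieces behave, assuming the definitions above are importable from `modeling_fastesm`; the tensor shapes here are illustrative only:

```python
import torch
from modeling_fastesm import FastEsmConfig, RotaryEmbedding, rotate_half

# Config built with the defaults declared above (vocab_size left unset here).
config = FastEsmConfig(hidden_size=768, num_attention_heads=12)

# Toy query/key pair shaped (batch, heads, seq_len, head_dim).
rot = RotaryEmbedding(dim=16)
q = torch.randn(1, 2, 8, 16)
k = torch.randn(1, 2, 8, 16)
q_rot, k_rot = rot(q, k)  # rotates q and k by position-dependent angles
assert q_rot.shape == q.shape and k_rot.shape == k.shape

# rotate_half negates and swaps the two halves of the last dimension,
# so applying it twice returns the negated input.
x = torch.randn(4, 16)
assert torch.allclose(rotate_half(rotate_half(x)), -x)
```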
@@ -134,6 +233,10 @@ class EsmSelfAttention(nn.Module):
         if self.position_embedding_type == "rotary":
             query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
 
+        # Ensure all tensors have the same dtype before calling scaled_dot_product_attention
+        #query_layer = query_layer.to(value_layer.dtype)
+        #key_layer = key_layer.to(value_layer.dtype)
+
         context_layer = F.scaled_dot_product_attention(
             query_layer,
             key_layer,
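The commented-out casts above document a dtype mismatch that can arise when the rotary cos/sin caches end up in a different dtype than the value projection (for example after loading the model in float16), since `F.scaled_dot_product_attention` requires query, key, and value to share a dtype. A hypothetical standalone sketch of that cast; float64/float32 are used here only so the snippet runs everywhere, float16 is the case that matters in practice:

```python
import torch
import torch.nn.functional as F

# Stand-ins for query/key/value projections whose dtypes have drifted apart.
query_layer = torch.randn(1, 2, 8, 16, dtype=torch.float64)
key_layer = torch.randn(1, 2, 8, 16, dtype=torch.float64)
value_layer = torch.randn(1, 2, 8, 16, dtype=torch.float32)

# The cast described by the commented-out lines in the diff above.
query_layer = query_layer.to(value_layer.dtype)
key_layer = key_layer.to(value_layer.dtype)

out = F.scaled_dot_product_attention(query_layer, key_layer, value_layer)
assert out.dtype == value_layer.dtype
```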
@@ -501,7 +604,7 @@ class FastEsmForTokenClassification(FastEsmPreTrainedModel):
 if __name__ == "__main__":
     """
     Test the hidden state differences between the FastEsmModel and the HF EsmModel.
-    In full precision, the differences are very small, but nonzero due to floating point issues with F.scaled_dot_product_attention.
+    In full precision, the differences are very very small, but nonzero due to floating point issues with F.scaled_dot_product_attention.
     In Pytorch 2.5+ (and linux kernel), this implementation is very fast and uses less memory than the HF implementation.
     """
     import random
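As context for the docstring above: the nonzero difference comes from `F.scaled_dot_product_attention` and a manual softmax attention accumulating rounding error differently, even though the two agree in exact arithmetic. A small self-contained sketch (not part of the test script) that makes the magnitude visible:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = torch.randn(1, 4, 32, 16)
k = torch.randn(1, 4, 32, 16)
v = torch.randn(1, 4, 32, 16)

# Manual attention: matmul, scale, softmax, matmul (the unfused formulation).
scores = q @ k.transpose(-1, -2) / (q.shape[-1] ** 0.5)
manual = torch.softmax(scores, dim=-1) @ v

# Fused kernel used by the fast implementation.
fused = F.scaled_dot_product_attention(q, k, v)

# Equal up to floating point error; typically on the order of 1e-7 in float32.
print((manual - fused).abs().max())
```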
@@ -526,8 +629,9 @@ if __name__ == "__main__":
     for model_path in model_paths:
         print(f"Testing {model_path}...")
         tokenizer = EsmTokenizer.from_pretrained(model_path)
-
-
+        config = FastEsmConfig.from_pretrained(model_path)
+        fast_model = FastEsmModel(config).from_pretrained(model_path, torch_dtype=torch.float16).to(device)
+        model = TransformersEsmModel.from_pretrained(model_path, token_dropout=False, torch_dtype=torch.float16).to(device)
 
         counts = [0] * len(tolerances)
         for _ in range(seq_count):