Update modeling_fastesm.py
Browse files- modeling_fastesm.py +129 -13
modeling_fastesm.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
from torch.nn import functional as F
|
|
|
|
| 4 |
from typing import Optional, Tuple, Union
|
| 5 |
from einops import rearrange
|
| 6 |
from transformers import PreTrainedModel, PretrainedConfig
|
|
@@ -20,11 +21,11 @@ from transformers.models.esm.modeling_esm import (
|
|
| 20 |
EsmClassificationHead,
|
| 21 |
create_position_ids_from_input_ids,
|
| 22 |
)
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class FastEsmConfig(PretrainedConfig):
|
| 26 |
model_type = "fast_esm"
|
| 27 |
-
|
| 28 |
def __init__(
|
| 29 |
self,
|
| 30 |
vocab_size=None,
|
|
@@ -141,14 +142,6 @@ class EsmEmbeddings(nn.Module):
|
|
| 141 |
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
| 142 |
)
|
| 143 |
|
| 144 |
-
self.padding_idx = config.pad_token_id
|
| 145 |
-
self.position_embeddings = nn.Embedding(
|
| 146 |
-
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
|
| 147 |
-
)
|
| 148 |
-
# Token dropout does not work correctly so we disable it
|
| 149 |
-
# self.token_dropout = config.token_dropout
|
| 150 |
-
self.mask_token_id = config.mask_token_id
|
| 151 |
-
|
| 152 |
def forward(
|
| 153 |
self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
|
| 154 |
):
|
|
@@ -164,10 +157,6 @@ class EsmEmbeddings(nn.Module):
|
|
| 164 |
|
| 165 |
embeddings = inputs_embeds
|
| 166 |
|
| 167 |
-
if self.position_embedding_type == "absolute":
|
| 168 |
-
position_embeddings = self.position_embeddings(position_ids)
|
| 169 |
-
embeddings = embeddings + position_embeddings
|
| 170 |
-
|
| 171 |
if self.layer_norm is not None:
|
| 172 |
embeddings = self.layer_norm(embeddings)
|
| 173 |
if attention_mask is not None:
|
|
@@ -336,6 +325,19 @@ class EsmEncoder(nn.Module):
|
|
| 336 |
)
|
| 337 |
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
class FastEsmPreTrainedModel(PreTrainedModel):
|
| 340 |
"""
|
| 341 |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
|
@@ -364,6 +366,120 @@ class FastEsmPreTrainedModel(PreTrainedModel):
|
|
| 364 |
except AttributeError:
|
| 365 |
return self.esm.embeddings.word_embeddings
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
class FastEsmModel(FastEsmPreTrainedModel):
|
| 369 |
def __init__(self, config, add_pooling_layer=True):
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
from torch.nn import functional as F
|
| 4 |
+
from torch.utils.data import Dataset, DataLoader
|
| 5 |
from typing import Optional, Tuple, Union
|
| 6 |
from einops import rearrange
|
| 7 |
from transformers import PreTrainedModel, PretrainedConfig
|
|
|
|
| 21 |
EsmClassificationHead,
|
| 22 |
create_position_ids_from_input_ids,
|
| 23 |
)
|
| 24 |
+
from tqdm.auto import tqdm
|
| 25 |
|
| 26 |
|
| 27 |
class FastEsmConfig(PretrainedConfig):
|
| 28 |
model_type = "fast_esm"
|
|
|
|
| 29 |
def __init__(
|
| 30 |
self,
|
| 31 |
vocab_size=None,
|
|
|
|
| 142 |
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
|
| 143 |
)
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def forward(
|
| 146 |
self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
|
| 147 |
):
|
|
|
|
| 157 |
|
| 158 |
embeddings = inputs_embeds
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
if self.layer_norm is not None:
|
| 161 |
embeddings = self.layer_norm(embeddings)
|
| 162 |
if attention_mask is not None:
|
|
|
|
| 325 |
)
|
| 326 |
|
| 327 |
|
| 328 |
+
### Dataset for Embedding
class ProteinDataset(Dataset):
    """Minimal map-style dataset over raw protein sequence strings.

    Sequences are stored untouched; tokenization/padding is deferred to the
    DataLoader's collate_fn so batches can be padded to their longest member.
    """

    def __init__(self, sequences: list[str]):
        self.sequences = sequences

    def __len__(self) -> int:
        return len(self.sequences)

    def __getitem__(self, idx: int) -> str:
        return self.sequences[idx]
|
| 339 |
+
|
| 340 |
+
|
| 341 |
class FastEsmPreTrainedModel(PreTrainedModel):
|
| 342 |
"""
|
| 343 |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
|
|
|
| 366 |
except AttributeError:
|
| 367 |
return self.esm.embeddings.word_embeddings
|
| 368 |
|
| 369 |
+
@property
def device(self) -> torch.device:
    """Device on which the model's parameters currently reside."""
    # Any parameter will do; they are assumed to live on a single device.
    first_param = next(self.parameters())
    return first_param.device
|
| 373 |
+
|
| 374 |
+
def mean_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Average token embeddings over the sequence dimension.

    When an attention mask is given, padded positions are zeroed out before
    summing and the sum is divided by the number of unmasked tokens per
    sequence; otherwise a plain mean over dim 1 is taken.
    """
    if attention_mask is None:
        return x.mean(dim=1)
    mask = attention_mask.unsqueeze(-1)
    summed = (x * mask).sum(dim=1)
    return summed / mask.sum(dim=1)
|
| 381 |
+
|
| 382 |
+
def _collate_fn(self, sequences: list[str]):
    """Tokenize a batch of sequences, padding to the longest (in multiples of 8).

    Returns the tokenizer's encoding — a mapping holding 'input_ids' and
    'attention_mask' tensors (callers index it like a dict). The previous
    annotation claimed a tuple of tensors, which was incorrect.
    NOTE(review): assumes self.tokenizer is a HuggingFace-style tokenizer
    supporting return_tensors/padding/pad_to_multiple_of — confirm.
    """
    return self.tokenizer(sequences, return_tensors="pt", padding='longest', pad_to_multiple_of=8)
|
| 385 |
+
|
| 386 |
+
def _read_sequences_from_db(self, db_path: str) -> set[str]:
    """Collect every sequence already stored in the SQLite 'embeddings' table."""
    import sqlite3
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT sequence FROM embeddings")
        # Cursors are iterable: each row is a 1-tuple (sequence,).
        return {row[0] for row in cursor}
|
| 399 |
+
|
| 400 |
+
def embed_dataset(
    self,
    sequences: list[str],
    batch_size: int = 2,
    max_len: int = 512,
    full_embeddings: bool = False,
    full_precision: bool = False,
    pooling_type: str = 'mean',
    num_workers: int = 0,
    sql: bool = False,
    sql_db_path: str = 'embeddings.db',
) -> Optional[dict[str, torch.Tensor]]:
    """Embed a dataset of protein sequences.

    Args:
        sequences: List of protein sequences
        batch_size: Batch size for processing
        max_len: Maximum sequence length (sequences are truncated to this)
        full_embeddings: Whether to return full residue-wise (True) embeddings or pooled (False)
        full_precision: Whether to cast to full precision (float32) before storage - relevant for dict storage
        pooling_type: Type of pooling ('mean' or 'cls')
        num_workers: Number of workers for data loading, 0 for the main process
        sql: Whether to store embeddings in SQLite database - will be stored in float32
        sql_db_path: Path to SQLite database

    Returns:
        Dictionary mapping sequences to embeddings, or None if sql=True
    """
    # Deduplicate + truncate, then sort longest-first so per-batch padding is minimized.
    sequences = list(set([seq[:max_len] for seq in sequences]))
    sequences = sorted(sequences, key=len, reverse=True)
    device = self.device

    def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Dispatch on the embedding mode: residue-wise, masked mean, or CLS token.
        if full_embeddings:
            return residue_embeddings
        elif pooling_type == 'mean':
            return self.mean_pooling(residue_embeddings, attention_mask)
        else:
            return residue_embeddings[:, 0, :]

    if sql:
        import sqlite3
        conn = sqlite3.connect(sql_db_path)
        c = conn.cursor()
        c.execute('CREATE TABLE IF NOT EXISTS embeddings (sequence text PRIMARY KEY, embedding blob)')
        already_embedded = self._read_sequences_from_db(sql_db_path)
        to_embed = [seq for seq in sequences if seq not in already_embedded]
        print(f"Found {len(already_embedded)} already embedded sequences in {sql_db_path}")
        print(f"Embedding {len(to_embed)} new sequences")
        # BUG FIX: build the dataloader from the NOT-yet-embedded sequences.
        # Previously it was built from all `sequences`, so `to_embed` was
        # computed but ignored and everything was re-embedded every run.
        dataloader = DataLoader(ProteinDataset(to_embed), batch_size=batch_size,
                                num_workers=num_workers, collate_fn=self._collate_fn)

        with torch.no_grad():
            for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
                # DataLoader preserves order (no shuffling), so this slice
                # matches the sequences in the current batch.
                seqs = to_embed[i * batch_size:(i + 1) * batch_size]
                input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                residue_embeddings = self.forward(input_ids, attention_mask, output_hidden_states=True).hidden_states[-1].float()  # required for sql
                embeddings = get_embeddings(residue_embeddings, attention_mask)

                for seq, emb in zip(seqs, embeddings):
                    c.execute("INSERT OR REPLACE INTO embeddings VALUES (?, ?)",
                            (seq, emb.cpu().numpy().tobytes()))

                # Commit periodically so a crash loses at most ~100 batches.
                if (i + 1) % 100 == 0:
                    conn.commit()

        conn.commit()
        conn.close()
        return None

    dataloader = DataLoader(ProteinDataset(sequences), batch_size=batch_size,
                            num_workers=num_workers, collate_fn=self._collate_fn)
    embeddings_dict = {}
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
            seqs = sequences[i * batch_size:(i + 1) * batch_size]
            input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            residue_embeddings = self.forward(input_ids, attention_mask, output_hidden_states=True).hidden_states[-1]
            # BUG FIX: only upcast when requested. The previous code called
            # .float() unconditionally, which made the full_precision flag a
            # no-op and silently doubled memory for half-precision models.
            if full_precision:
                residue_embeddings = residue_embeddings.float()
            embeddings = get_embeddings(residue_embeddings, attention_mask)
            for seq, emb in zip(seqs, embeddings):
                embeddings_dict[seq] = emb

    return embeddings_dict
|
| 483 |
|
| 484 |
class FastEsmModel(FastEsmPreTrainedModel):
|
| 485 |
def __init__(self, config, add_pooling_layer=True):
|