IlPakoZ commited on
Commit
01a9e83
·
1 Parent(s): 86b3b93

Added data_collater function, refactor

Browse files
Files changed (2) hide show
  1. README.md +28 -1
  2. modeling_m5_encoder.py +104 -3
README.md CHANGED
@@ -40,6 +40,33 @@ model = AutoModelForSequenceClassification.from_pretrained(
40
  )
41
  ```
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  ## Architecture
45
 
@@ -114,7 +141,7 @@ The processed dataset contains **82,686,706 SMILES sequences**, each paired with
114
  | Split | Sequences | Tokens (approx.) |
115
  |---|---|---|
116
  | Train | 66,149,364 | ~2.5 B (×2 with augmentation → ~5 B) |
117
- | Validation | 8,268,673 | |
118
  | Test | 8,268,669 | ~ 0.82 B (×2 with augmentation → ~1.64 B) |
119
 
120
  Training augmentation generates randomized SELFIES on the fly from each SMILES. Labels are normalized before training.
 
40
  )
41
  ```
42
 
43
+ ### Preparing inputs
44
+
45
+ Inputs require SELFIES tokenization **and** a precomputed distance matrix
46
+ (`relative_position`). Use the helper bundled in the repo:
47
+
48
+ ```python
49
+
50
+ tokenizer = AutoTokenizer.from_pretrained("IlPakoZ/m5-encoder", trust_remote_code=True)
51
+
52
+ smiles = "CCO"
53
+
54
+ # seed = 0 produces the canonical SELFIES; other values generate reproducible random variations
55
+ selfies, pos_encod, _ = model.get_positional_encodings_and_align(smiles, seed=0)
56
+
57
+ encoding = tokenizer(selfies, return_tensors="pt")
58
+ input_ids = encoding["input_ids"]
59
+ attn_mask = encoding["attention_mask"]
60
+ rel_pos = torch.tensor(pos_encod).unsqueeze(0) # (1, seq_len, seq_len)
61
+
62
+ outputs = model(input_ids=input_ids, attention_mask=attn_mask, relative_position=rel_pos)
63
+ hidden = outputs.last_hidden_state # (1, seq_len, 512)
64
+ ```
65
+
66
+ A helper ``model.collate_for_dataset`` is also available to perform collation for use with PyTorch's ``DataLoader``. It takes a list of tuples, each composed of:
67
+ - a dictionary with keys ``"input_ids"`` (``np.ndarray``, shape ``(L,)``) and ``"attention_mask"`` (``np.ndarray``, shape ``(L,)``), as produced by a tokenizer;
68
+ - the positional embedding matrix;
69
+ - (optional) token regression labels. These are kept mainly for reproducibility of our paper's results and can be left as ``None`` in most circumstances.
70
 
71
  ## Architecture
72
 
 
141
  | Split | Sequences | Tokens (approx.) |
142
  |---|---|---|
143
  | Train | 66,149,364 | ~2.5 B (×2 with augmentation → ~5 B) |
144
+ | Validation | 8,268,673 | TBD |
145
  | Test | 8,268,669 | ~ 0.82 B (×2 with augmentation → ~1.64 B) |
146
 
147
  Training augmentation generates randomized SELFIES on the fly from each SMILES. Labels are normalized before training.
modeling_m5_encoder.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import math
4
  import logging
5
 
6
- from typing import Optional, Union
7
  import torch.nn as nn
8
  from transformers import PreTrainedModel, T5EncoderModel, T5ForConditionalGeneration, T5ForQuestionAnswering, T5ForTokenClassification, T5Model, load_tf_weights_in_t5
9
  from torch import nn
@@ -64,9 +64,9 @@ class M5Encoder(PreTrainedModel):
64
  return self.model(input_ids=input_ids,
65
  attention_mask=attention_mask,
66
  relative_position=relative_position)
67
-
 
68
  def get_positional_encodings_and_align(
69
- self,
70
  smiles: str,
71
  seed: int,
72
  token_regr: Optional[np.ndarray] = None,
@@ -107,7 +107,107 @@ class M5Encoder(PreTrainedModel):
107
  (branches, rings, dots). ``None`` if ``token_regr`` was not
108
  provided.
109
  """
 
110
  return get_positional_encodings_and_align(smiles, token_regr, seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  class M5EncoderModel(T5EncoderModel):
113
  def __init__(self, config: T5Config):
@@ -161,6 +261,7 @@ class M5EncoderModel(T5EncoderModel):
161
  input_ids=input_ids,
162
  attention_mask=attention_mask,
163
  inputs_embeds=inputs_embeds,
 
164
  head_mask=head_mask,
165
  output_attentions=output_attentions,
166
  output_hidden_states=output_hidden_states,
 
3
  import math
4
  import logging
5
 
6
+ from typing import Any, Optional, Union, Sequence
7
  import torch.nn as nn
8
  from transformers import PreTrainedModel, T5EncoderModel, T5ForConditionalGeneration, T5ForQuestionAnswering, T5ForTokenClassification, T5Model, load_tf_weights_in_t5
9
  from torch import nn
 
64
  return self.model(input_ids=input_ids,
65
  attention_mask=attention_mask,
66
  relative_position=relative_position)
67
+
68
+ @staticmethod
69
  def get_positional_encodings_and_align(
 
70
  smiles: str,
71
  seed: int,
72
  token_regr: Optional[np.ndarray] = None,
 
107
  (branches, rings, dots). ``None`` if ``token_regr`` was not
108
  provided.
109
  """
110
+
111
  return get_positional_encodings_and_align(smiles, token_regr, seed)
112
+
113
+ @staticmethod
114
+ def collate_for_dataset(batch: list[dict[str, Any]], n_global_regr: int = 0, PAD_TOKEN_ID: int = 2):
115
+ """
116
+ Collate processed data for pytorch dataloaders.
117
+
118
+ Each item in ``batch`` is a 3-tuple ``(token_dict, pos_encod, reg)``
119
+ where:
120
+
121
+ - ``token_dict`` is a dict with keys ``"input_ids"`` (``np.ndarray``,
122
+ shape ``(L,)``) and ``"attention_mask"`` (``np.ndarray``, shape
123
+ ``(L,)``), as produced by a tokenizer.
124
+ - ``pos_encod`` is an ``np.ndarray`` of shape ``(L, L)`` and dtype
125
+ ``np.int16`` holding pairwise molecular-graph distances, as returned
126
+ by :meth:`get_positional_encodings_and_align`.
127
+ - ``reg`` is an ``np.ndarray`` of shape
128
+ ``(n_global_regr + L - 1,)`` containing first the
129
+ ``n_global_regr`` sequence-level regression targets followed by
130
+ ``L - 1`` token-level targets (one per non-CLS token). Ignored when
131
+ ``n_global_regr == 0``.
132
+
133
+ All sequences are right-padded to the length of the longest sequence
134
+ in the batch (``L_max``):
135
+
136
+ - ``input_ids`` is padded with ``PAD_TOKEN_ID``.
137
+ - ``attention_mask`` is padded with ``0``.
138
+ - ``pos_encod`` is padded with ``np.iinfo(np.int16).max``; the
139
+ diagonal of the padded region is set to ``0`` to be consistent with
140
+ real token self-distances.
141
+ - ``labels`` (when present) is padded with ``float("nan")`` so that
142
+ padding positions can be masked out in the loss.
143
+
144
+ Args:
145
+ batch: List of ``(token_dict, pos_encod, reg)`` tuples, one per
146
+ sample.
147
+ n_global_regr: Number of sequence-level regression targets at the
148
+ start of each ``reg`` array. When ``0``, no ``"labels"`` key
149
+ is included in the returned dict.
150
+ PAD_TOKEN_ID: Token id used to fill padded positions in
151
+ ``input_ids``. Defaults to ``2``.
152
+
153
+ Returns:
154
+ A dict with the following keys:
155
+
156
+ - ``"input_ids"`` — ``torch.LongTensor`` of shape
157
+ ``(B, L_max)``.
158
+ - ``"attention_mask"`` — ``torch.LongTensor`` of shape
159
+ ``(B, L_max)``; ``1`` for real tokens, ``0`` for padding.
160
+ - ``"positional_encodings"`` — ``torch.ShortTensor`` of shape
161
+ ``(B, L_max, L_max)``.
162
+ - ``"labels"`` *(only when* ``n_global_regr > 0`` *)* —
163
+ ``torch.FloatTensor`` of shape
164
+ ``(B, n_global_regr + L_max - 1)``; ``nan`` for padding
165
+ positions.
166
+ """
167
+ token_dicts, pos_encod, regs = zip(*batch)
168
+ lengths = [td["input_ids"].shape[0] for td in token_dicts]
169
+ L_max = max(lengths)
170
+ B = len(batch)
171
+
172
+ input_ids_out = np.full((B, L_max), PAD_TOKEN_ID, dtype=np.int64)
173
+ attn_mask_out = np.zeros((B, L_max), dtype=np.int64)
174
+ pos_encod_out = np.full((B, L_max, L_max), np.iinfo(np.int16).max, dtype=np.int16)
175
+
176
+ if n_global_regr > 0:
177
+ reg_out = np.full((B, n_global_regr + L_max - 1), float("nan"), dtype=np.float32)
178
+
179
+ # Set diagonal to 0 up-front for the full L_max grid; individual items
180
+ # already have their diagonal zeroed — this covers the padded extension.
181
+ diag_idx = np.arange(L_max)
182
+ pos_encod_out[:, diag_idx, diag_idx] = 0
183
+
184
+ for i, (td, pe, reg) in enumerate(zip(token_dicts, pos_encod, regs)):
185
+ L = lengths[i]
186
+
187
+ # Token ids & attention mask
188
+ input_ids_out[i, :L] = td["input_ids"]
189
+ attn_mask_out[i, :L] = td["attention_mask"]
190
+
191
+ # Positional embedding (L x L)
192
+ pos_encod_out[i, :L, :L] = pe
193
+
194
+ # Regression: global part + token part (length L - 1, excluding CLS)
195
+ if n_global_regr > 0:
196
+ reg_out[i, :n_global_regr] = reg[:n_global_regr]
197
+ reg_out[i, n_global_regr:n_global_regr + L - 1] = reg[n_global_regr:]
198
+
199
+ out = {
200
+ "input_ids": torch.from_numpy(input_ids_out),
201
+ "attention_mask": torch.from_numpy(attn_mask_out),
202
+ "positional_encodings": torch.from_numpy(pos_encod_out),
203
+ }
204
+
205
+ if n_global_regr > 0:
206
+ out["labels"] = torch.from_numpy(reg_out)
207
+
208
+ return out
209
+
210
+
211
 
212
  class M5EncoderModel(T5EncoderModel):
213
  def __init__(self, config: T5Config):
 
261
  input_ids=input_ids,
262
  attention_mask=attention_mask,
263
  inputs_embeds=inputs_embeds,
264
+
265
  head_mask=head_mask,
266
  output_attentions=output_attentions,
267
  output_hidden_states=output_hidden_states,