IlPakoZ
/

m5-encoder

@@ -4,7 +4,7 @@
   ],
   "auto_map": {
   "AutoConfig": "modeling_m5_encoder.M5EncoderConfig",
-  "AutoModel": "modeling_m5_encoder.M5ModelForRegression",
   "AutoModelForSequenceClassification": "modeling_m5_encoder.M5ModelForRegression"
   },
   "classifier_dropout": 0,

   ],
   "auto_map": {
   "AutoConfig": "modeling_m5_encoder.M5EncoderConfig",
+  "AutoModel": "modeling_m5_encoder.M5Model",
   "AutoModelForSequenceClassification": "modeling_m5_encoder.M5ModelForRegression"
   },
   "classifier_dropout": 0,

modeling_m5_encoder.py CHANGED Viewed

@@ -54,18 +54,59 @@ class M5EncoderConfig(T5Config):
 class M5Encoder(PreTrainedModel):
     config_class = M5EncoderConfig
     def __init__(self, config):
         super().__init__(config)
         self.model = M5EncoderModel(config)
-        #self.model = torch.compile(self.model, mode="max-autotune", fullgraph=True)
     def forward(self, input_ids, attention_mask=None, relative_position=None, **kwargs):
         return self.model(input_ids=input_ids,
             attention_mask=attention_mask,
             relative_position=relative_position)
-    def get_positional_embeddings_and_align(self, smiles, token_regr, seed):
         return get_positional_encodings_and_align(smiles, token_regr, seed)
 class M5EncoderModel(T5EncoderModel):

 class M5Encoder(PreTrainedModel):
     config_class = M5EncoderConfig
+    base_model_prefix = "encoder"
     def __init__(self, config):
         super().__init__(config)
         self.model = M5EncoderModel(config)
     def forward(self, input_ids, attention_mask=None, relative_position=None, **kwargs):
         return self.model(input_ids=input_ids,
             attention_mask=attention_mask,
             relative_position=relative_position)
+    def get_positional_embeddings_and_align(
+        self,
+        smiles: str,
+        seed: int,
+        token_regr: Optional[np.ndarray] = None,
+    ) -> tuple[str, np.ndarray, Optional[np.ndarray]]:
+        """
+        Convert a SMILES string into a SELFIES tokenization, compute pairwise
+        molecular-graph distance encodings, and optionally align token-level
+        regression labels to the new token order.
+        Args:
+            smiles: Input molecule as a SMILES string. Does not need to be
+                canonical — canonicalization and optional randomization are
+                applied internally.
+            seed: Epoch/seed value controlling SMILES augmentation. When 0,
+                the canonical SELFIES is used; any other value produces a
+                reproducible randomized SELFIES variant.
+            token_regr: Optional array of per-atom regression labels (e.g.
+                Löwdin charges) aligned to the original SMILES atom order.
+                If provided, labels are re-aligned to match the SELFIES token
+                order of the (possibly randomized) output SMILES.
+                Shape: ``(n_atoms,)``. Defaults to ``None``, in which case
+                no labels are aligned.
+        Returns:
+            A tuple of:
+            - **selfies** (``str``): SELFIES encoding of the (possibly
+              randomized) SMILES.
+            - **pos_encod** (``np.ndarray``): Pairwise distance matrix of
+              shape ``(seq_len, seq_len)`` with ``dtype=np.int16``. Entries
+              are shortest-path graph distances between atoms, capped at
+              ``np.iinfo(np.int16).max - 1``. Special values: ``0`` for
+              CLS-to-token, token-to-CLS, and ring/dot-separated fragment
+              pairs; ``-1`` for intra-branch/ring structural tokens;
+              ``np.iinfo(np.int16).max`` for padding positions.
+            - **token_regr_selfies** (``np.ndarray`` or ``None``): Labels
+              re-aligned to SELFIES token positions, shape
+              ``(seq_len - 1,)``, with ``np.nan`` for non-atom tokens
+              (branches, rings, dots). ``None`` if ``token_regr`` was not
+              provided.
+        """
         return get_positional_encodings_and_align(smiles, token_regr, seed)
 class M5EncoderModel(T5EncoderModel):