Upload FastChemTokenizerHF.py
FastChemTokenizerHF.py  CHANGED  (+48 -0)
@@ -463,6 +463,54 @@ class FastChemTokenizer(PreTrainedTokenizerBase):
             token = self.id_to_token.get(tid, self.unk_token)
             tid_str = "None" if tid is None else f"{tid:5d}"
             print(f" [{i:03d}] ID={tid_str} → '{token}'")
+
+    def pad(
+        self,
+        encoded_inputs,
+        padding=True,
+        max_length=None,
+        pad_to_multiple_of=None,
+        return_tensors=None,
+        **kwargs,
+    ):
+        """
+        HuggingFace-style pad. Takes a dict (or list of dicts) of encoded
+        inputs and pads them into a rectangular batch; `padding` and any
+        extra kwargs are accepted for API compatibility.
+        """
+        # Accept a single encoding as well as a batch.
+        if isinstance(encoded_inputs, dict):
+            encoded_inputs = [encoded_inputs]
+
+        input_ids = [ei["input_ids"] for ei in encoded_inputs]
+        attn_masks = [ei.get("attention_mask", [1] * len(ei["input_ids"])) for ei in encoded_inputs]
+
+        # Determine the pad length: the longest sequence in the batch,
+        # rounded up to a multiple of pad_to_multiple_of, capped at max_length.
+        max_len = max(len(ids) for ids in input_ids)
+        if pad_to_multiple_of:
+            max_len = ((max_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
+        if max_length is not None:
+            max_len = min(max_len, max_length)
+
+        padded_ids, padded_masks = [], []
+        for ids, mask in zip(input_ids, attn_masks):
+            # Truncate sequences longer than the capped max_len so the batch
+            # stays rectangular (otherwise pad_len would go negative).
+            ids, mask = ids[:max_len], mask[:max_len]
+            pad_len = max_len - len(ids)
+            if self.padding_side == "right":
+                padded_ids.append(ids + [self.pad_token_id] * pad_len)
+                padded_masks.append(mask + [0] * pad_len)
+            else:
+                padded_ids.append([self.pad_token_id] * pad_len + ids)
+                padded_masks.append([0] * pad_len + mask)
+
+        out = {"input_ids": padded_ids, "attention_mask": padded_masks}
+        if return_tensors in ["pt", "torch"]:
+            out = {k: torch.tensor(v, dtype=torch.long) for k, v in out.items()}
+        return out
+
 
     # ------------------------------
     # Save / Load
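Below is a minimal usage sketch of the new pad method. It assumes the tokenizer exposes the usual from_pretrained loader (the file's Save / Load section suggests one) and that pad_token_id is set with the default padding_side of "right"; the checkpoint path and token IDs are illustrative, not from this repo.

    from FastChemTokenizerHF import FastChemTokenizer

    # Illustrative checkpoint path; from_pretrained is assumed here.
    tok = FastChemTokenizer.from_pretrained("./fastchem_tokenizer")

    # Two encodings of different lengths; the first omits its
    # attention_mask, so pad() synthesizes an all-ones mask for it.
    batch = [
        {"input_ids": [5, 12, 7]},
        {"input_ids": [5, 12, 7, 9, 3], "attention_mask": [1, 1, 1, 1, 1]},
    ]

    out = tok.pad(batch, pad_to_multiple_of=8, return_tensors="pt")
    print(out["input_ids"].shape)    # torch.Size([2, 8])
    print(out["attention_mask"][0])  # tensor([1, 1, 1, 0, 0, 0, 0, 0])

The rounding expression ((max_len + m - 1) // m) * m is integer ceil-to-multiple: with the longest sequence at 5 and m = 8, both rows come out at length 8. Padding to a fixed multiple keeps batch shapes stable, which helps kernels and compilers that prefer aligned sizes.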