jheuschkel committed on
Commit
08d3be6
·
verified ·
1 Parent(s): a548f72

Upload tokenization_syncodonlm.py

Browse files
Files changed (1) hide show
  1. tokenization_syncodonlm.py +39 -0
tokenization_syncodonlm.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer
2
+ import json
3
+ import os
4
+
5
class SynCodonLMTokenizer(PreTrainedTokenizer):
    """Codon-level tokenizer for nucleotide sequences.

    Uppercases the input, maps RNA to DNA (``U`` -> ``T``), and splits the
    sequence into non-overlapping 3-character codon tokens which are looked
    up in a JSON vocab file (token -> id mapping).

    Args:
        vocab_file: Path to a JSON file mapping token strings to integer ids.
        unk_token / pad_token / cls_token / sep_token / mask_token:
            Special-token strings; must be present in the vocab for lookups
            to resolve to real ids.
    """

    def __init__(self, vocab_file, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
                 sep_token="[SEP]", mask_token="[MASK]", **kwargs):
        # BUGFIX: load the vocab BEFORE calling super().__init__().
        # PreTrainedTokenizer.__init__ (transformers >= 4.34) registers the
        # special tokens via get_vocab()/_tokenize, which read self.vocab —
        # the original order raised AttributeError during construction.
        self.vocab = self._load_vocab(vocab_file)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab  # alias kept for backward compatibility
        self.vocab_file = vocab_file
        super().__init__(unk_token=unk_token, pad_token=pad_token,
                         cls_token=cls_token, sep_token=sep_token,
                         mask_token=mask_token, **kwargs)

    @property
    def vocab_size(self):
        """Number of tokens in the base vocabulary (required by the base class)."""
        return len(self.vocab)

    def _load_vocab(self, vocab_file):
        """Read the token->id mapping from a JSON file."""
        with open(vocab_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def _tokenize(self, text):
        """Split *text* into codons: uppercase, RNA->DNA, then 3-char chunks.

        NOTE(review): a trailing 1- or 2-char remainder becomes a short token
        that will map to unk unless present in the vocab — presumably inputs
        are full codons; confirm against the training data.
        """
        text = text.upper().replace("U", "T")
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        """Map one token to its id, falling back to the unk token's id."""
        return self.tokens_to_ids.get(token, self.tokens_to_ids.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Map one id back to its token, falling back to the unk token string."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_ids(self, tokens):
        # Base-class contract: a single token string returns a single id;
        # the original list-only implementation broke that and iterated the
        # string character by character.
        if isinstance(tokens, str):
            return self._convert_token_to_id(tokens)
        return [self._convert_token_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        # skip_special_tokens added with a default: decode() passes it as a
        # keyword, which the original two-argument override could not accept.
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = [self._convert_id_to_token(i) for i in ids]
        if skip_special_tokens:
            special = set(self.all_special_tokens)
            tokens = [t for t in tokens if t not in special]
        return tokens

    def get_vocab(self):
        """Return the full token->id mapping (base vocab; required hook)."""
        return self.vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocab JSON into *save_directory*; returns the file path tuple."""
        vocab_path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f)
        return (vocab_path,)