jheuschkel committed on
Commit
08d3be6
·
verified ·
1 Parent(s): a548f72

Upload tokenization_syncodonlm.py

Browse files
Files changed (1) hide show
  1. tokenization_syncodonlm.py +39 -0
tokenization_syncodonlm.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizer
2
+ import json
3
+ import os
4
+
5
class SynCodonLMTokenizer(PreTrainedTokenizer):
    """Codon-level tokenizer for nucleotide sequences.

    Uppercases the input, maps RNA to DNA (``U`` -> ``T``), and splits the
    sequence into non-overlapping 3-character codon tokens which are looked
    up in a JSON vocab file (token -> id mapping).

    Args:
        vocab_file: Path to a JSON file mapping token strings to integer ids.
        unk_token / pad_token / cls_token / sep_token / mask_token:
            Special-token strings; must be present in the vocab for lookups
            to resolve to real ids.
    """

    def __init__(self, vocab_file, unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
                 sep_token="[SEP]", mask_token="[MASK]", **kwargs):
        # BUGFIX: load the vocab BEFORE calling super().__init__().
        # PreTrainedTokenizer.__init__ (transformers >= 4.34) registers the
        # special tokens via get_vocab()/_tokenize, which read self.vocab —
        # the original order raised AttributeError during construction.
        self.vocab = self._load_vocab(vocab_file)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab  # alias kept for backward compatibility
        self.vocab_file = vocab_file
        super().__init__(unk_token=unk_token, pad_token=pad_token,
                         cls_token=cls_token, sep_token=sep_token,
                         mask_token=mask_token, **kwargs)

    @property
    def vocab_size(self):
        """Number of tokens in the base vocabulary (required by the base class)."""
        return len(self.vocab)

    def _load_vocab(self, vocab_file):
        """Read the token->id mapping from a JSON file."""
        with open(vocab_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def _tokenize(self, text):
        """Split *text* into codons: uppercase, RNA->DNA, then 3-char chunks.

        NOTE(review): a trailing 1- or 2-char remainder becomes a short token
        that will map to unk unless present in the vocab — presumably inputs
        are full codons; confirm against the training data.
        """
        text = text.upper().replace("U", "T")
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        """Map one token to its id, falling back to the unk token's id."""
        return self.tokens_to_ids.get(token, self.tokens_to_ids.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Map one id back to its token, falling back to the unk token string."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_ids(self, tokens):
        # Base-class contract: a single token string returns a single id;
        # the original list-only implementation broke that and iterated the
        # string character by character.
        if isinstance(tokens, str):
            return self._convert_token_to_id(tokens)
        return [self._convert_token_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        # skip_special_tokens added with a default: decode() passes it as a
        # keyword, which the original two-argument override could not accept.
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = [self._convert_id_to_token(i) for i in ids]
        if skip_special_tokens:
            special = set(self.all_special_tokens)
            tokens = [t for t in tokens if t not in special]
        return tokens

    def get_vocab(self):
        """Return the full token->id mapping (base vocab; required hook)."""
        return self.vocab

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocab JSON into *save_directory*; returns the file path tuple."""
        vocab_path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f)
        return (vocab_path,)