# tokenizer.py
from transformers import PreTrainedTokenizer
from itertools import product
import json
import os


class CodonTokenizer(PreTrainedTokenizer):
    """Tokenizes a DNA sequence into non-overlapping codons (3-base chunks)."""

    def __init__(self, **kwargs):
        # Vocabulary: 3 special tokens followed by all 4**3 = 64 codons.
        bases = ['A', 'T', 'G', 'C']
        codons = [''.join(p) for p in product(bases, repeat=3)]
        special_tokens = ['[PAD]', '[BOS]', '[EOS]']
        self.vocab_list = special_tokens + codons
        self.codon2id = {token: idx for idx, token in enumerate(self.vocab_list)}
        self.id2codon = {idx: token for token, idx in self.codon2id.items()}
        kwargs['bos_token'] = '[BOS]'
        kwargs['eos_token'] = '[EOS]'
        kwargs['pad_token'] = '[PAD]'
        # The vocab must be built before super().__init__(), which calls
        # get_vocab() while registering the special tokens.
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Split into 3-base chunks, e.g. "ATGGCT" -> ["ATG", "GCT"].
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        # Unknown codons (e.g. chunks containing 'N') fall back to [PAD].
        return self.codon2id.get(token, self.codon2id['[PAD]'])

    def _convert_id_to_token(self, idx):
        return self.id2codon.get(idx, '[PAD]')

    def convert_tokens_to_string(self, tokens):
        return ''.join(tokens)

    def get_vocab(self):
        # Return a copy so callers cannot mutate the internal mapping.
        return dict(self.codon2id)

    @property
    def vocab_size(self):
        return len(self.codon2id)
    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Follows the Hugging Face convention of "<prefix>-vocab.json".
        filename = (filename_prefix + '-' if filename_prefix else '') + 'vocab.json'
        path = os.path.join(save_directory, filename)
        with open(path, 'w') as f:
            json.dump(self.codon2id, f)
        return (path,)
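

if __name__ == '__main__':
    # Minimal usage sketch (illustrative, not part of the original file):
    # the toy sequence and the values shown in the comments are assumptions.
    tokenizer = CodonTokenizer()
    seq = 'ATGGCTTAA'  # hypothetical sequence; length should be a multiple of 3
    tokens = tokenizer.tokenize(seq)               # ['ATG', 'GCT', 'TAA']
    ids = tokenizer.convert_tokens_to_ids(tokens)  # e.g. [7, 30, 14]
    print(tokens, ids)
    # Round-trip: ids -> tokens -> string should reproduce the input.
    decoded = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
    assert decoded == seq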