|
|
|
|
|
|
|
|
from transformers import PreTrainedTokenizer |
|
|
from itertools import product |
|
|
import json |
|
|
import os |
|
|
|
|
|
class CodonTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits DNA sequences into 3-base codon tokens.

    Vocabulary layout (order is fixed so token ids stay stable):
    ids 0-2 are the special tokens ``[PAD]``, ``[BOS]``, ``[EOS]``;
    ids 3-66 are the 64 codons over the alphabet {A, T, G, C} in
    ``itertools.product`` order.
    """

    def __init__(self, **kwargs):
        bases = ['A', 'T', 'G', 'C']
        # All 4**3 = 64 codons, in deterministic product order.
        codons = [''.join(p) for p in product(bases, repeat=3)]
        special_tokens = ['[PAD]', '[BOS]', '[EOS]']
        self.vocab_list = special_tokens + codons

        # token -> id and its inverse; built before super().__init__
        # because the base class may call get_vocab()/vocab_size.
        self.codon2id = {token: idx for idx, token in enumerate(self.vocab_list)}
        self.id2codon = {idx: token for token, idx in self.codon2id.items()}

        # Respect caller-supplied special tokens; fall back to our defaults.
        # (Previously these unconditionally overwrote any user-provided value.)
        kwargs.setdefault('bos_token', '[BOS]')
        kwargs.setdefault('eos_token', '[EOS]')
        kwargs.setdefault('pad_token', '[PAD]')
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Split *text* into consecutive 3-character codons.

        Input is uppercased so lowercase sequences tokenize correctly
        (the vocab contains only uppercase codons).  A trailing fragment
        shorter than 3 bases is kept as-is and will fall through to
        ``[PAD]`` in ``_convert_token_to_id``.
        """
        text = text.upper()
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        """Map a codon or special token to its id; unknowns map to [PAD]'s id."""
        return self.codon2id.get(token, self.codon2id['[PAD]'])

    def _convert_id_to_token(self, idx):
        """Map an id back to its token; out-of-range ids map to '[PAD]'."""
        return self.id2codon.get(idx, '[PAD]')

    def convert_tokens_to_string(self, tokens):
        """Concatenate codon tokens back into a single sequence string."""
        return ''.join(tokens)

    def get_vocab(self):
        """Return a copy of the token -> id mapping.

        A copy is returned so callers cannot mutate the tokenizer's
        internal vocabulary through the result.
        """
        return dict(self.codon2id)

    @property
    def vocab_size(self):
        # 3 special tokens + 64 codons = 67.
        return len(self.codon2id)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to ``<save_directory>/<prefix>vocab.json``.

        Returns a 1-tuple containing the written file path, per the
        PreTrainedTokenizer.save_vocabulary convention.
        """
        # Tolerate a not-yet-created target directory.
        os.makedirs(save_directory, exist_ok=True)
        path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.codon2id, f)
        return (path,)