# tokenizer.py
"""Codon-level tokenizer: splits DNA sequences into 3-base codon tokens."""

from itertools import product
import json
import os

from transformers import PreTrainedTokenizer


class CodonTokenizer(PreTrainedTokenizer):
    """Tokenizer over a fixed vocabulary of 3 special tokens + all 64 codons.

    Vocabulary layout: [PAD]=0, [BOS]=1, [EOS]=2, then the 64 codons
    (AAA, AAT, AAG, ...) in ``itertools.product`` order over A, T, G, C.
    """

    def __init__(self, **kwargs):
        bases = ['A', 'T', 'G', 'C']
        # All 64 possible codons (4^3 ordered triples of bases).
        codons = [''.join(p) for p in product(bases, repeat=3)]
        special_tokens = ['[PAD]', '[BOS]', '[EOS]']
        self.vocab_list = special_tokens + codons
        # Forward/reverse lookup tables. Built BEFORE super().__init__,
        # which may call get_vocab()/vocab_size during construction.
        self.codon2id = {token: idx for idx, token in enumerate(self.vocab_list)}
        self.id2codon = {idx: token for token, idx in self.codon2id.items()}
        kwargs['bos_token'] = '[BOS]'
        kwargs['eos_token'] = '[EOS]'
        kwargs['pad_token'] = '[PAD]'
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Split *text* into non-overlapping 3-character codon chunks.

        NOTE(review): if ``len(text)`` is not a multiple of 3, the trailing
        1-2 characters form a partial token that _convert_token_to_id then
        silently maps to [PAD] — confirm inputs are always codon-aligned.
        """
        return [text[i:i + 3] for i in range(0, len(text), 3)]

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to the [PAD] id: the vocabulary has no
        # [UNK], and adding one now would shift every existing codon id.
        return self.codon2id.get(token, self.codon2id['[PAD]'])

    def _convert_id_to_token(self, idx):
        # Out-of-range ids fall back to the pad token string.
        return self.id2codon.get(idx, '[PAD]')

    def convert_tokens_to_string(self, tokens):
        """Concatenate codon tokens back into a bare sequence string."""
        return ''.join(tokens)

    def get_vocab(self):
        # Return a copy so callers (e.g. the added-tokens machinery in
        # PreTrainedTokenizer) cannot mutate the internal mapping in place.
        return dict(self.codon2id)

    @property
    def vocab_size(self):
        # 3 specials + 64 codons = 67.
        return len(self.codon2id)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the token->id map to ``vocab.json``.

        Returns a 1-tuple with the written path, per the HF
        ``save_vocabulary`` contract.
        """
        path = os.path.join(save_directory, (filename_prefix or "") + "vocab.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.codon2id, f)
        return (path,)