import os
from typing import List, Optional

from transformers import PreTrainedTokenizer

# Built-in vocabulary used when no vocab file is supplied. A token's position
# in this list is its id (e.g. "[PAD]" -> 0, "[UNK]" -> 4, "A" -> 5).
_DEFAULT_VOCAB = [
    "[PAD]",
    "[MASK]",
    "[CLS]",
    "[SEP]",
    "[UNK]",
    "A",
    "G",
    "C",
    "T",
    "U",
    "N",
    "[BOS]",
    "[EOS]",
    "[UNUSED1]",
    "[UNUSED2]",
    "[UNUSED3]",
]


class AIDORNATokenizer(PreTrainedTokenizer):
    """Tokenizer over a small fixed vocabulary of nucleotide letters and special tokens.

    The vocabulary is read from ``vocab_file`` (one token per line) when that
    file exists; otherwise the built-in ``_DEFAULT_VOCAB`` is used. A token's
    id is its position in the vocabulary list.
    """

    vocab_files_names = {"vocab_file": "vocab.txt"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token="[UNK]",
        cls_token="[CLS]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        sep_token="[SEP]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        **kwargs,
    ):
        """Load the vocabulary and initialise the base tokenizer.

        Args:
            vocab_file: Path to a text file with one token per line. Blank
                lines are skipped and surrounding whitespace is stripped.
                If ``None`` or not an existing file, the default vocabulary
                is used instead.
            unk_token, cls_token, pad_token, mask_token, sep_token,
            bos_token, eos_token: Special token strings, forwarded to the
                base class.
            **kwargs: Forwarded unchanged to the base class constructor.
        """
        if vocab_file is not None and os.path.isfile(vocab_file):
            # Explicit UTF-8 so loading does not depend on the platform locale.
            with open(vocab_file, encoding="utf-8") as f:
                self.all_tokens = [line.strip() for line in f if line.strip()]
        else:
            self.all_tokens = list(_DEFAULT_VOCAB)

        # The lookup tables are built before super().__init__() is called.
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: idx for idx, tok in enumerate(self.all_tokens)}

        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sep_token=sep_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register all vocab tokens as no-split so the trie-based tokenizer matches
        # single characters (A, G, C, T, U, N) and special tokens exactly.
        # A copy is stored so that in-place changes to the no-split list can
        # never alter ``all_tokens`` (and therefore ``vocab_size``).
        self.unique_no_split_tokens = list(self.all_tokens)
        self._update_trie(self.unique_no_split_tokens)

    @property
    def vocab_size(self) -> int:
        """Number of tokens in the base vocabulary (``all_tokens``)."""
        return len(self.all_tokens)

    def get_vocab(self):
        """Return a token -> id dict: the base vocabulary updated with ``added_tokens_encoder``."""
        vocab = dict(self._token_to_id)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        """Split ``text`` on whitespace; ``kwargs`` are accepted but unused."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Return the id of ``token``.

        Unknown tokens map to the id of ``"[UNK]"``; if the vocabulary has no
        ``"[UNK]"`` entry, 4 is returned (its index in ``_DEFAULT_VOCAB``).
        """
        return self._token_to_id.get(token, self._token_to_id.get("[UNK]", 4))

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or ``"[UNK]"`` if the id is not in the vocabulary."""
        return self._id_to_token.get(index, "[UNK]")

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Wrap one or two id sequences with CLS/SEP ids.

        Single sequence: ``CLS ids_0 SEP``.
        Pair:            ``CLS ids_0 SEP CLS ids_1 SEP``.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + cls + token_ids_1 + sep

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a 0/1 mask with 1 at special-token positions.

        When ``already_has_special_tokens`` is true, only ``token_ids_0`` is
        inspected and each id is checked against ``all_special_ids``.
        Otherwise the mask mirrors the layout produced by
        ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            # Evaluate the property once instead of once per token.
            special_ids = self.all_special_ids
            return [1 if t in special_ids else 0 for t in token_ids_0]

        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            mask += [1] + [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Return segment ids: 0 for the first wrapped sequence, 1 for the second.

        Each segment's length is the sequence length plus two, accounting for
        the CLS and SEP ids added by ``build_inputs_with_special_tokens``.
        """
        first_segment = [0] * (1 + len(token_ids_0) + 1)
        if token_ids_1 is None:
            return first_segment
        return first_segment + [1] * (1 + len(token_ids_1) + 1)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the base vocabulary to ``save_directory``, one token per line.

        The directory is created if missing. The file is named ``vocab.txt``,
        or ``<filename_prefix>-vocab.txt`` when a prefix is given.

        Returns:
            A 1-tuple containing the path of the written file.
        """
        os.makedirs(save_directory, exist_ok=True)
        fname = (filename_prefix + "-" if filename_prefix else "") + "vocab.txt"
        path = os.path.join(save_directory, fname)
        # Explicit UTF-8 to match how __init__ reads the file back.
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(self.all_tokens))
        return (path,)