AIDO.RNA-650M / tokenization_aidorna.py
Taykhoom's picture
Upload folder using huggingface_hub
a28a33f verified
import os
from typing import List, Optional
from transformers import PreTrainedTokenizer
# Built-in vocabulary used when no vocab file is supplied.  A token's position
# in this list is its id, so the order must not change.
_DEFAULT_VOCAB = (
    ["[PAD]", "[MASK]", "[CLS]", "[SEP]", "[UNK]"]  # ids 0-4: control tokens
    + ["A", "G", "C", "T", "U", "N"]  # ids 5-10: single-character tokens
    + ["[BOS]", "[EOS]"]  # ids 11-12
    + [f"[UNUSED{i}]" for i in range(1, 4)]  # ids 13-15: reserved slots
)
class AIDORNATokenizer(PreTrainedTokenizer):
    """Vocabulary-lookup tokenizer built on ``PreTrainedTokenizer``.

    The vocabulary is an ordered list of tokens, read from a ``vocab.txt``
    file (one token per line) or taken from ``_DEFAULT_VOCAB``; a token's
    position in that list is its id.  Every vocabulary token is registered as
    a no-split token, and any remaining text is split on whitespace.  Encoded
    sequences are wrapped as ``[CLS] ... [SEP]``.
    """

    vocab_files_names = {"vocab_file": "vocab.txt"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token="[UNK]",
        cls_token="[CLS]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        sep_token="[SEP]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        **kwargs,
    ):
        """Load the vocabulary and register the special tokens.

        Args:
            vocab_file: Path to a text file with one token per line.  Blank
                lines are skipped.  If ``None`` or not an existing file, the
                built-in ``_DEFAULT_VOCAB`` is used instead.
            unk_token, cls_token, pad_token, mask_token, sep_token,
            bos_token, eos_token: Special tokens forwarded to the base class.
            **kwargs: Extra arguments forwarded to the base class.
        """
        if vocab_file is not None and os.path.isfile(vocab_file):
            # Explicit encoding so the vocabulary loads identically on every
            # platform instead of depending on the locale default.
            with open(vocab_file, encoding="utf-8") as f:
                stripped = (line.strip() for line in f)
                self.all_tokens = [tok for tok in stripped if tok]
        else:
            self.all_tokens = list(_DEFAULT_VOCAB)

        # The lookup tables are built before the base constructor runs.
        # NOTE(review): presumably because the base class may query the
        # vocabulary while registering special tokens — confirm against the
        # installed transformers version.
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: idx for idx, tok in enumerate(self.all_tokens)}

        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sep_token=sep_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register all vocab tokens as no-split so the trie-based tokenizer matches
        # single characters (A, G, C, T, U, N) and special tokens exactly.
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    @property
    def vocab_size(self) -> int:
        """Number of tokens in the base vocabulary (added tokens excluded)."""
        return len(self.all_tokens)

    def get_vocab(self):
        """Return a new token -> id dict: base vocabulary plus added tokens."""
        vocab = dict(self._token_to_id)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        """Split ``text`` on whitespace; ``kwargs`` are accepted but unused."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Map ``token`` to its id, falling back to the unknown-token id.

        The unknown token is taken from ``self.unk_token`` so that a custom
        ``unk_token`` passed to the constructor is honoured.  If that token is
        not in the vocabulary either, id 4 is returned (the position of
        ``"[UNK]"`` in the default vocabulary).
        """
        unk_id = self._token_to_id.get(self.unk_token, 4)
        return self._token_to_id.get(token, unk_id)

    def _convert_id_to_token(self, index):
        """Map ``index`` to its token, or to the unknown token if out of range."""
        # ``or`` keeps the historical "[UNK]" result if no unk token is set.
        return self._id_to_token.get(index, self.unk_token or "[UNK]")

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Wrap one or two id sequences with the CLS and SEP ids.

        Returns ``[CLS] A [SEP]`` for a single sequence and
        ``[CLS] A [SEP] [CLS] B [SEP]`` for a pair.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        wrapped = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            wrapped = wrapped + cls + token_ids_1 + sep
        return wrapped

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask with 1 at special-token positions and 0 elsewhere.

        Args:
            token_ids_0: First id sequence.
            token_ids_1: Optional second id sequence.  Ignored when
                ``already_has_special_tokens`` is true.
            already_has_special_tokens: If true, ``token_ids_0`` is inspected
                as-is and each id found in ``self.all_special_ids`` is marked.
                Otherwise the mask mirrors the layout produced by
                ``build_inputs_with_special_tokens``.
        """
        if already_has_special_tokens:
            # Evaluate the property once instead of once per token.
            special_ids = set(self.all_special_ids)
            return [1 if t in special_ids else 0 for t in token_ids_0]

        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            mask += [1] + [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment ids matching ``build_inputs_with_special_tokens``.

        The first sequence and its CLS/SEP wrapper get 0; the second sequence
        and its wrapper (if present) get 1.
        """
        # +2 accounts for the CLS and SEP ids wrapped around each sequence.
        type_ids = [0] * (len(token_ids_0) + 2)
        if token_ids_1 is not None:
            type_ids += [1] * (len(token_ids_1) + 2)
        return type_ids

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the base vocabulary to ``<save_directory>/[<prefix>-]vocab.txt``.

        The directory is created if needed.  Tokens are written one per line
        in id order, without a trailing newline.

        Returns:
            A 1-tuple containing the path of the written file.
        """
        os.makedirs(save_directory, exist_ok=True)
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        path = os.path.join(save_directory, prefix + "vocab.txt")
        # Explicit encoding keeps the file readable by ``__init__`` on any platform.
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(self.all_tokens))
        return (path,)