# RNAErnie / tokenization_rnaernie.py
# Taykhoom's picture
# Upload folder using huggingface_hub
# 7b1fe62 verified
import os
from typing import Dict, List, Optional, Tuple
from transformers import PreTrainedTokenizer
# Built-in token -> id table. Ids are assigned by position in the tuple
# below, so the ORDER of the entries is significant and must not change.
_VOCAB = {
    token: index
    for index, token in enumerate(
        (
            # Special tokens: ids 0-6.
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
            "[DEL]",
            "[IND]",
            # ncRNA type labels: ids 7-34.
            "RNaseMRPRNA",
            "RNasePRNA",
            "SRPRNA",
            "YRNA",
            "antisenseRNA",
            "autocatalyticallysplicedintron",
            "guideRNA",
            "hammerheadribozyme",
            "lncRNA",
            "miRNA",
            "miscRNA",
            "ncRNA",
            "other",
            "piRNA",
            "premiRNA",
            "precursorRNA",
            "rRNA",
            "ribozyme",
            "sRNA",
            "scRNA",
            "scaRNA",
            "siRNA",
            "snRNA",
            "snoRNA",
            "tRNA",
            "telomeraseRNA",
            "tmRNA",
            "vaultRNA",
            # Nucleotides: ids 35-38.
            "A",
            "T",
            "C",
            "G",
        )
    )
}
class RNAErnieTokenizer(PreTrainedTokenizer):
    """Character-level RNA tokenizer for RNAErnie (original ERNIE/PaddlePaddle version).

    Converts U to T before tokenisation (model was pretrained with DNA-style T).
    Input sequences are uppercased and U->T substituted automatically.

    Vocabulary (39 tokens):
      - Special: [PAD]=0, [UNK]=1, [CLS]=2, [SEP]=3, [MASK]=4, [DEL]=5, [IND]=6
      - ncRNA type labels: indices 7-34 (28 labels)
      - Nucleotides: A=35, T=36, C=37, G=38

    Special-token layout:
      - single sequence: ``[CLS] A [SEP]``
      - sequence pair:   ``[CLS] A [SEP] B [SEP]``
    """

    vocab_files_names = {"vocab_file": "vocab.txt"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        pad_token="[PAD]",
        unk_token="[UNK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
        **kwargs,
    ):
        """Build the tokenizer from a vocabulary file or the built-in table.

        Args:
            vocab_file: Optional path to a text file with one token per line;
                the 0-based line number becomes the token id. When the path
                is empty or does not point to an existing file, the built-in
                ``_VOCAB`` table is used instead.
            pad_token: Padding token string.
            unk_token: Unknown-token string.
            cls_token: Token placed at the start of every encoded input.
            sep_token: Token placed after every segment.
            mask_token: Masking token string.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The vocabulary is set up BEFORE super().__init__() is called.
        # NOTE(review): presumably the base constructor queries the vocab
        # while registering special tokens -- confirm before reordering.
        if vocab_file and os.path.isfile(vocab_file):
            self._vocab = {}
            with open(vocab_file, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    token = line.rstrip("\n")
                    self._vocab[token] = idx
        else:
            # Copy so per-instance changes never touch the module constant.
            self._vocab = dict(_VOCAB)
        # Reverse lookup table (id -> token) for decoding.
        self._ids_to_tokens = {v: k for k, v in self._vocab.items()}
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            cls_token=cls_token,
            sep_token=sep_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the token -> id table."""
        return len(self._vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return dict(self._vocab)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into single characters after uppercasing and U->T."""
        return list(text.upper().replace("U", "T"))

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the id of ``[UNK]``."""
        return self._vocab.get(token, self._vocab["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token, falling back to ``[UNK]`` for unknown ids."""
        return self._ids_to_tokens.get(index, "[UNK]")

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the vocabulary to ``<save_directory>/[<prefix>-]vocab.txt``.

        Tokens are written one per line in ascending id order, which is the
        format ``__init__`` reads back. The directory is created if missing.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        os.makedirs(save_directory, exist_ok=True)
        fname = (filename_prefix + "-" if filename_prefix else "") + "vocab.txt"
        path = os.path.join(save_directory, fname)
        with open(path, "w", encoding="utf-8") as f:
            for token, _ in sorted(self._vocab.items(), key=lambda x: x[1]):
                f.write(token + "\n")
        return (path,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Add special tokens around one sequence or a pair of sequences.

        Returns:
            ``[CLS] A [SEP]`` for a single sequence, or
            ``[CLS] A [SEP] B [SEP]`` when ``token_ids_1`` is given.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a 0/1 mask marking special-token positions with 1.

        When ``already_has_special_tokens`` is true the call is delegated to
        the base class. Otherwise the mask mirrors the layout produced by
        ``build_inputs_with_special_tokens`` and has exactly the same length.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0, token_ids_1, already_has_special_tokens=True
            )
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            # The [SEP] closing the first segment is already counted above;
            # the pair layout adds only the second segment and ONE trailing
            # [SEP]. An extra leading 1 here would make the mask one element
            # longer than the encoded input.
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment ids matching ``build_inputs_with_special_tokens``.

        ``[CLS] A [SEP]`` is labelled 0; the optional ``B [SEP]`` part is
        labelled 1.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return [0] * len(cls + token_ids_0 + sep)
        return [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep)