# decodon-200M-euk / tokenization_decodon.py
import json
import os
import re
from transformers import PreTrainedTokenizer
from itertools import product
class DeCodonTokenizer(PreTrainedTokenizer):
"""
    DeCodonTokenizer: tokenizes sequences into 3-mer codon tokens.
    The inputs are expected to be raw coding DNA/RNA sequences.
"""
SUPPORTED_TYPES = ["dna", "rna"]
@staticmethod
def get_all_codons(seq_type="dna"):
"""
Get all possible codons.
"""
seq_type = seq_type.lower()
assert (
seq_type in DeCodonTokenizer.SUPPORTED_TYPES
), f"seq_type should be either 'dna' or 'rna'. Got {seq_type}!"
if seq_type == "dna":
return ["".join(codon) for codon in product("ACGT", repeat=3)]
else:
return ["".join(codon) for codon in product("ACGU", repeat=3)]
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
"""
Instantiate a DeCodonTokenizer from a pre-trained tokenizer.
"""
# Handle the case where we're loading from a local directory
if os.path.isdir(pretrained_model_name_or_path):
vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
if os.path.exists(vocab_file):
kwargs["vocab_file"] = vocab_file
else:
# For hub loading, try to get the vocab file from the cached download
from transformers.utils import cached_file
try:
vocab_file = cached_file(pretrained_model_name_or_path, "vocab.json")
if vocab_file:
kwargs["vocab_file"] = vocab_file
except Exception:
# If vocab.json is not found, continue without it (use default vocab)
pass
# Create instance with the vocab_file parameter
return cls(*inputs, **kwargs)
def __init__(
self,
vocab_file=None,
cls_token="<CLS>",
bos_token="<CLS>",
sep_token="<SEP>",
unk_token="<UNK>",
pad_token="<PAD>",
mask_token="<MASK>",
seq_type="dna",
**kwargs,
):
self.codons = self.get_all_codons(seq_type=seq_type)
self.seq_type = seq_type
self.special_tokens = [cls_token, sep_token, unk_token, pad_token, mask_token]
self.special_tokens = [str(token) for token in self.special_tokens]
        if vocab_file is not None:
            with open(vocab_file, "r") as f:
                self.encoder = json.load(f)
self.decoder = {i: k for k, i in self.encoder.items()}
self.compiled_regex = re.compile(
"|".join(list(self.encoder.keys()) + [r"\S"])
)
else:
self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
self.decoder = {i: k for k, i in self.encoder.items()}
self.compiled_regex = re.compile(
"|".join(self.codons + self.special_tokens + [r"\S"])
)
super().__init__(
cls_token=cls_token,
bos_token=bos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.aa_to_codon = {
"A": ["GCT", "GCC", "GCA", "GCG"],
"C": ["TGT", "TGC"],
"D": ["GAT", "GAC"],
"E": ["GAA", "GAG"],
"F": ["TTT", "TTC"],
"G": ["GGT", "GGC", "GGA", "GGG"],
"H": ["CAT", "CAC"],
"I": ["ATT", "ATC", "ATA"],
"K": ["AAA", "AAG"],
"L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
"M": ["ATG"],
"N": ["AAT", "AAC"],
"P": ["CCT", "CCC", "CCA", "CCG"],
"Q": ["CAA", "CAG"],
"R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
"S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
"T": ["ACT", "ACC", "ACA", "ACG"],
"V": ["GTT", "GTC", "GTA", "GTG"],
"W": ["TGG"],
"Y": ["TAT", "TAC"],
"*": ["TAA", "TAG", "TGA"],
}
self.codon_to_aa = {
codon: aa for aa, codons in self.aa_to_codon.items() for codon in codons
}
if seq_type == "rna":
self.aa_to_codon = {
k: [c.replace("T", "U") for c in v] for k, v in self.aa_to_codon.items()
}
self.codon_to_aa = {
k.replace("T", "U"): v for k, v in self.codon_to_aa.items()
}
self.amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
self.encoder_aa = {
k: i for i, k in enumerate(self.special_tokens + self.amino_acids)
}
self.compiled_regex_aa = re.compile(
"|".join(self.amino_acids + self.special_tokens + [r"\S"])
)
self.token_type_mode = kwargs.get("token_type_mode", "regular")
self.build_token_type_encoder()
def set_organism_tokens(self, organism_tokens):
"""
Add organism tokens to the tokenizer.
"""
vocab_size = len(self.encoder)
for i, token in enumerate(organism_tokens):
self.encoder[token] = vocab_size + i
self.decoder[vocab_size + i] = token
self.organism_tokens = organism_tokens
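        # rebuild the tokenization regex so organism tokens are matched as single tokens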
self.compiled_regex = re.compile(
"|".join(self.codons + self.special_tokens + organism_tokens + [r"\S"])
)
@property
def vocab_size(self):
return len(self.encoder)
def build_token_type_encoder(self):
if self.token_type_mode == "aa":
            # map each codon id to a token type derived from its amino acid;
            # special tokens (CLS, SEP, UNK, PAD, MASK) all share token type 0
token_type_encoder = {}
for token, token_id in self.encoder.items():
if token in self.special_tokens:
token_type_encoder[token_id] = 0
elif token in self.codons:
aa = self.codon_to_aa[token]
token_type_encoder[token_id] = (
list(self.amino_acids + ["*"]).index(aa) + 1
)
else:
token_type_encoder[token_id] = len(self.amino_acids) + 2
elif self.token_type_mode == "regular":
# build a token type encoder for regular tokens
token_type_encoder = {token_id: 0 for token_id in self.encoder.values()}
elif self.token_type_mode == "regular_special":
            # special tokens share one token type (0); all other tokens share another (1)
token_type_encoder = {
token_id: 0 if token in self.special_tokens else 1
for token, token_id in self.encoder.items()
}
else:
raise ValueError(f"Unknown token type mode: {self.token_type_mode}")
self.token_type_encoder = token_type_encoder
@property
def token_type_vocab_size(self):
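        # one extra id is reserved for tokens missing from token_type_encoder
        # (used as the fallback type in create_token_type_ids_from_sequences)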
return len(set(self.token_type_encoder.values())) + 1
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def _tokenize(self, text):
"""
Tokenize a string.
"""
text = text.upper()
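        # The alternation matches known multi-character tokens (codons, special and
        # organism tokens) first; any leftover non-whitespace character falls through
        # to \S and is later mapped to the unknown token.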
tokens = self.compiled_regex.findall(text)
return tokens
def _convert_token_to_id(self, token):
"""
Converts a token (str) in an id using the vocab.
"""
return self.encoder.get(token, self.encoder[self.unk_token])
def _convert_id_to_token(self, index):
"""
Converts an index (integer) in a token (str) using the vocab.
"""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) in a single string.
"""
return "".join(tokens)
    def encode_aa(self, text):
        """
        Encode a DNA/RNA string using the amino acid vocab by translating each
        codon to its amino acid before lookup. Tokens that are not codons
        (e.g. special tokens) are looked up as-is, and anything absent from the
        amino acid vocab (e.g. stop codons) maps to the unknown token.
        """
        tokens = self._tokenize(text)
        return [
            self.encoder_aa.get(
                self.codon_to_aa.get(token, token),
                self.encoder_aa[self.unk_token],
            )
            for token in tokens
        ]
def get_aa_vocab_size(self):
return len(self.encoder_aa)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
        concatenating and adding special tokens.
        This implementation prepends `<CLS>` and appends `<SEP>` to the first sequence; `token_ids_1` is ignored.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The model input with special tokens.
"""
token_ids_0 = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
return token_ids_0
def get_special_tokens_mask(
self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool = False
):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of ids of the second sequence.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
special_ids = [
self.eos_token_id,
self.pad_token_id,
self.mask_token_id,
self.sep_token_id,
self.cls_token_id,
]
if already_has_special_tokens:
special_tokens_mask = [
1 if idx in special_ids else 0 for idx in token_ids_0
]
else:
special_tokens_mask = (
[1] + [1 if idx in special_ids else 0 for idx in token_ids_0] + [1]
)
return special_tokens_mask
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
# special_ids = [
# self.bos_token_id,
# self.eos_token_id,
# self.pad_token_id,
# self.mask_token_id,
# self.cls_token_id,
# self.sep_token_id,
# ]
# token_type_ids = [0] + [0 for idx in token_ids_0] + [0]
unk_type_id = len(set(self.token_type_encoder.values()))
token_type_ids = [
self.token_type_encoder.get(token_id, unk_type_id)
for token_id in token_ids_0
]
return token_type_ids
def save_vocabulary(self, save_directory, filename_prefix=None):
"""
Save only the vocabulary of the tokenizer (vocabulary + added tokens).
This method won't save the configuration and special token mappings of the tokenizer. Use
[`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if filename_prefix is None:
filename_prefix = ""
vocab_file = os.path.join(save_directory, filename_prefix + "vocab.json")
with open(vocab_file, "w") as f:
json.dump(self.encoder, f)
return (vocab_file,)
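
# Minimal usage sketch (illustrative only, not part of the uploaded tokenizer).
# It assumes direct, local instantiation of the class defined above; the example
# coding sequence is arbitrary.
if __name__ == "__main__":
    tokenizer = DeCodonTokenizer(seq_type="dna")
    cds = "ATGGCTTGA"  # Met-Ala-Stop
    tokens = tokenizer.tokenize(cds)   # ['ATG', 'GCT', 'TGA']
    ids = tokenizer.encode(cds)        # [<CLS> id] + codon ids + [<SEP> id]
    print(tokens)
    print(ids)
    print(tokenizer.convert_ids_to_tokens(ids))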