# decodon-200M-euk / tokenization_decodon.py
import json
import os
import re
from transformers import PreTrainedTokenizer
from itertools import product
class DeCodonTokenizer(PreTrainedTokenizer):
"""
    DeCodonTokenizer: tokenizes sequences into 3-mer codon tokens.
    The inputs are expected to be raw coding DNA/RNA sequences.
"""
SUPPORTED_TYPES = ["dna", "rna"]
@staticmethod
def get_all_codons(seq_type="dna"):
"""
Get all possible codons.
"""
seq_type = seq_type.lower()
assert (
seq_type in DeCodonTokenizer.SUPPORTED_TYPES
), f"seq_type should be either 'dna' or 'rna'. Got {seq_type}!"
if seq_type == "dna":
return ["".join(codon) for codon in product("ACGT", repeat=3)]
else:
return ["".join(codon) for codon in product("ACGU", repeat=3)]
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
"""
Instantiate a DeCodonTokenizer from a pre-trained tokenizer.
"""
# Handle the case where we're loading from a local directory
if os.path.isdir(pretrained_model_name_or_path):
vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
if os.path.exists(vocab_file):
kwargs["vocab_file"] = vocab_file
else:
# For hub loading, try to get the vocab file from the cached download
from transformers.utils import cached_file
try:
vocab_file = cached_file(pretrained_model_name_or_path, "vocab.json")
if vocab_file:
kwargs["vocab_file"] = vocab_file
except Exception:
# If vocab.json is not found, continue without it (use default vocab)
pass
# Create instance with the vocab_file parameter
return cls(*inputs, **kwargs)
def __init__(
self,
vocab_file=None,
cls_token="<CLS>",
bos_token="<CLS>",
sep_token="<SEP>",
unk_token="<UNK>",
pad_token="<PAD>",
mask_token="<MASK>",
seq_type="dna",
**kwargs,
):
self.codons = self.get_all_codons(seq_type=seq_type)
self.seq_type = seq_type
self.special_tokens = [cls_token, sep_token, unk_token, pad_token, mask_token]
self.special_tokens = [str(token) for token in self.special_tokens]
        if vocab_file is not None:
            with open(vocab_file, "r") as f:
                self.encoder = json.load(f)
self.decoder = {i: k for k, i in self.encoder.items()}
self.compiled_regex = re.compile(
"|".join(list(self.encoder.keys()) + [r"\S"])
)
else:
self.encoder = {k: i for i, k in enumerate(self.special_tokens + self.codons)}
self.decoder = {i: k for k, i in self.encoder.items()}
self.compiled_regex = re.compile(
"|".join(self.codons + self.special_tokens + [r"\S"])
)
super().__init__(
cls_token=cls_token,
bos_token=bos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
self.aa_to_codon = {
"A": ["GCT", "GCC", "GCA", "GCG"],
"C": ["TGT", "TGC"],
"D": ["GAT", "GAC"],
"E": ["GAA", "GAG"],
"F": ["TTT", "TTC"],
"G": ["GGT", "GGC", "GGA", "GGG"],
"H": ["CAT", "CAC"],
"I": ["ATT", "ATC", "ATA"],
"K": ["AAA", "AAG"],
"L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
"M": ["ATG"],
"N": ["AAT", "AAC"],
"P": ["CCT", "CCC", "CCA", "CCG"],
"Q": ["CAA", "CAG"],
"R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
"S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
"T": ["ACT", "ACC", "ACA", "ACG"],
"V": ["GTT", "GTC", "GTA", "GTG"],
"W": ["TGG"],
"Y": ["TAT", "TAC"],
"*": ["TAA", "TAG", "TGA"],
}
self.codon_to_aa = {
codon: aa for aa, codons in self.aa_to_codon.items() for codon in codons
}
if seq_type == "rna":
self.aa_to_codon = {
k: [c.replace("T", "U") for c in v] for k, v in self.aa_to_codon.items()
}
self.codon_to_aa = {
k.replace("T", "U"): v for k, v in self.codon_to_aa.items()
}
self.amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
self.encoder_aa = {
k: i for i, k in enumerate(self.special_tokens + self.amino_acids)
}
self.compiled_regex_aa = re.compile(
"|".join(self.amino_acids + self.special_tokens + [r"\S"])
)
self.token_type_mode = kwargs.get("token_type_mode", "regular")
self.build_token_type_encoder()
def set_organism_tokens(self, organism_tokens):
"""
Add organism tokens to the tokenizer.
"""
vocab_size = len(self.encoder)
for i, token in enumerate(organism_tokens):
self.encoder[token] = vocab_size + i
self.decoder[vocab_size + i] = token
self.organism_tokens = organism_tokens
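        # rebuild the tokenization regex so organism tokens are matched as single tokens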
self.compiled_regex = re.compile(
"|".join(self.codons + self.special_tokens + organism_tokens + [r"\S"])
)
@property
def vocab_size(self):
return len(self.encoder)
def build_token_type_encoder(self):
if self.token_type_mode == "aa":
            # map each codon id to a token type derived from its amino acid;
            # special tokens (CLS, SEP, UNK, PAD, MASK) all share token type 0
token_type_encoder = {}
for token, token_id in self.encoder.items():
if token in self.special_tokens:
token_type_encoder[token_id] = 0
elif token in self.codons:
aa = self.codon_to_aa[token]
token_type_encoder[token_id] = (
list(self.amino_acids + ["*"]).index(aa) + 1
)
else:
token_type_encoder[token_id] = len(self.amino_acids) + 2
elif self.token_type_mode == "regular":
# build a token type encoder for regular tokens
token_type_encoder = {token_id: 0 for token_id in self.encoder.values()}
elif self.token_type_mode == "regular_special":
            # special tokens share one token type (0); all other tokens share another (1)
token_type_encoder = {
token_id: 0 if token in self.special_tokens else 1
for token, token_id in self.encoder.items()
}
else:
raise ValueError(f"Unknown token type mode: {self.token_type_mode}")
self.token_type_encoder = token_type_encoder
@property
def token_type_vocab_size(self):
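        # one extra id is reserved for tokens missing from token_type_encoder
        # (used as the fallback type in create_token_type_ids_from_sequences)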
return len(set(self.token_type_encoder.values())) + 1
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def _tokenize(self, text):
"""
Tokenize a string.
"""
text = text.upper()
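        # The alternation matches known multi-character tokens (codons, special and
        # organism tokens) first; any leftover non-whitespace character falls through
        # to \S and is later mapped to the unknown token.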
tokens = self.compiled_regex.findall(text)
return tokens
def _convert_token_to_id(self, token):
"""
Converts a token (str) in an id using the vocab.
"""
return self.encoder.get(token, self.encoder[self.unk_token])
def _convert_id_to_token(self, index):
"""
Converts an index (integer) in a token (str) using the vocab.
"""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) in a single string.
"""
return "".join(tokens)
    def encode_aa(self, text):
        """
        Encode a DNA/RNA string using the amino acid vocab by translating each
        codon to its amino acid before lookup. Tokens that are not codons
        (e.g. special tokens) are looked up as-is, and anything absent from the
        amino acid vocab (e.g. stop codons) maps to the unknown token.
        """
        tokens = self._tokenize(text)
        return [
            self.encoder_aa.get(
                self.codon_to_aa.get(token, token),
                self.encoder_aa[self.unk_token],
            )
            for token in tokens
        ]
def get_aa_vocab_size(self):
return len(self.encoder_aa)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
        concatenating and adding special tokens.
        This implementation prepends `<CLS>` and appends `<SEP>` to the first sequence; `token_ids_1` is ignored.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The model input with special tokens.
"""
token_ids_0 = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
return token_ids_0
def get_special_tokens_mask(
self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool = False
):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of ids of the second sequence.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
special_ids = [
self.eos_token_id,
self.pad_token_id,
self.mask_token_id,
self.sep_token_id,
self.cls_token_id,
]
if already_has_special_tokens:
special_tokens_mask = [
1 if idx in special_ids else 0 for idx in token_ids_0
]
else:
special_tokens_mask = (
[1] + [1 if idx in special_ids else 0 for idx in token_ids_0] + [1]
)
return special_tokens_mask
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (`List[int]`): The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
# special_ids = [
# self.bos_token_id,
# self.eos_token_id,
# self.pad_token_id,
# self.mask_token_id,
# self.cls_token_id,
# self.sep_token_id,
# ]
# token_type_ids = [0] + [0 for idx in token_ids_0] + [0]
unk_type_id = len(set(self.token_type_encoder.values()))
token_type_ids = [
self.token_type_encoder.get(token_id, unk_type_id)
for token_id in token_ids_0
]
return token_type_ids
def save_vocabulary(self, save_directory, filename_prefix=None):
"""
Save only the vocabulary of the tokenizer (vocabulary + added tokens).
This method won't save the configuration and special token mappings of the tokenizer. Use
[`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
                An optional prefix to add to the names of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if filename_prefix is None:
filename_prefix = ""
vocab_file = os.path.join(save_directory, filename_prefix + "vocab.json")
with open(vocab_file, "w") as f:
json.dump(self.encoder, f)
return (vocab_file,)
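
# Minimal usage sketch (illustrative only, not part of the uploaded tokenizer).
# It assumes direct, local instantiation of the class defined above; the example
# coding sequence is arbitrary.
if __name__ == "__main__":
    tokenizer = DeCodonTokenizer(seq_type="dna")
    cds = "ATGGCTTGA"  # Met-Ala-Stop
    tokens = tokenizer.tokenize(cds)   # ['ATG', 'GCT', 'TGA']
    ids = tokenizer.encode(cds)        # [<CLS> id] + codon ids + [<SEP> id]
    print(tokens)
    print(ids)
    print(tokenizer.convert_ids_to_tokens(ids))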