PlantGenoANN / tokenization_caduceus.py

Rename PlantGenoANN/tokenization_caduceus.py to tokenization_caduceus.py

ac6c905 verified 12 days ago

4.04 kB

	from transformers import PreTrainedTokenizer, AddedToken
	from typing import List, Optional, Union, Dict, Sequence, Tuple
	from pathlib import Path
	import json
	import os

	# Copied from HyenaDNATokenizer
	class CaduceusTokenizer(PreTrainedTokenizer):
	model_input_names = ["input_ids"]

	def __init__(self,
	model_max_length: int,
	bos_token="[BOS]",
	eos_token="[SEP]",
	sep_token="[SEP]",
	cls_token="[CLS]",
	pad_token="[PAD]",
	mask_token="[MASK]",
	unk_token="[UNK]",
	**kwargs):
	"""Character tokenizer for Hugging Face transformers.
	Args:
	characters (Sequence[str]): List of desired characters. Any character which
	is not included in this list will be replaced by a special token called
	[UNK] with id=6. Following are list of all of the special tokens with
	their corresponding ids:
	"[CLS]": 0
	"[SEP]": 1
	"[BOS]": 2
	"[MASK]": 3
	"[PAD]": 4
	"[RESERVED]": 5
	"[UNK]": 6
	an id (starting at 7) will be assigned to each character.
	model_max_length (int): Model maximum sequence length.
	"""
	self.characters = ('A', 'C', 'G', 'T', 'N')
	self.model_max_length = model_max_length

	self._vocab_str_to_int = {
	"[CLS]": 0,
	"[SEP]": 1,
	"[BOS]": 2,
	"[MASK]": 3,
	"[PAD]": 4,
	"[RESERVED]": 5,
	"[UNK]": 6,
	**{ch: i + 7 for i, ch in enumerate(self.characters)},
	}
	self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
	add_prefix_space = kwargs.pop("add_prefix_space", False)
	padding_side = kwargs.pop("padding_side", "left")

	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	unk_token=unk_token,
	add_prefix_space=add_prefix_space,
	model_max_length=model_max_length,
	padding_side=padding_side,
	**kwargs,
	)

	@property
	def vocab_size(self) -> int:
	return len(self._vocab_str_to_int)

	def _tokenize(self, text: str) -> List[str]:
	return list(text)

	def _convert_token_to_id(self, token: str) -> int:
	return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])

	def _convert_id_to_token(self, index: int) -> str:
	return self._vocab_int_to_str[index]

	def convert_tokens_to_string(self, tokens):
	return "".join(tokens)

	def get_special_tokens_mask(
	self,
	token_ids_0: List[int],
	token_ids_1: Optional[List[int]] = None,
	already_has_special_tokens: bool = False,
	) -> List[int]:
	if already_has_special_tokens:
	return super().get_special_tokens_mask(
	token_ids_0=token_ids_0,
	token_ids_1=token_ids_1,
	already_has_special_tokens=True,
	)

	result = ([0] * len(token_ids_0)) + [1]
	if token_ids_1 is not None:
	result += ([0] * len(token_ids_1)) + [1]
	return result

	def build_inputs_with_special_tokens(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	cls = [self.cls_token_id]
	sep = [self.sep_token_id]
	result = cls + token_ids_0 + sep
	if token_ids_1 is not None:
	result += token_ids_1 + sep
	return result

	def get_vocab(self) -> Dict[str, int]:
	return self._vocab_str_to_int

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
	return ()