| | from transformers.tokenization_utils import PreTrainedTokenizer |
| | from transformers.utils import logging |
| | from transformers import AutoTokenizer |
| | from transformers.tokenization_utils_base import BatchEncoding |
| | import torch |
| | import numpy as np |
| | from typing import List, Dict, Optional, Union, Tuple |
| |
|
| | |
| | from transformers.models.auto import AutoTokenizer |
| | from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING |
| | from transformers.models.auto.configuration_auto import CONFIG_MAPPING |
| |
|
# Module-level logger, following the transformers logging convention.
logger = logging.get_logger(__name__)
| |
|
class Evo2Tokenizer(PreTrainedTokenizer):
    """
    Tokenizer for Evo2 models - wraps the CharLevelTokenizer to be compatible
    with HuggingFace.

    Token ids are raw character codes, so token <-> id conversion is just
    ``chr()`` / ``ord()`` on the id itself; there is no vocabulary file.
    """

    # No vocabulary files: the char-level vocab is implicit in the id range.
    vocab_files_names = {}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        evo2_tokenizer,
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        **kwargs,
    ):
        """
        Initialize the Evo2Tokenizer.

        Args:
            evo2_tokenizer: The Evo2 CharLevelTokenizer to wrap.
            bos_token: Beginning of sequence token.
            eos_token: End of sequence token.
            pad_token: Padding token.
            unk_token: Unknown token.
        """
        self.evo2_tokenizer = evo2_tokenizer

        # Stash the special-token strings before the base-class constructor
        # runs, since PreTrainedTokenizer may consult them during __init__.
        self._pad_token = pad_token
        self._eos_token = eos_token
        self._bos_token = bos_token
        self._unk_token = unk_token

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            **kwargs,
        )

        # Mirror the wrapped tokenizer's special-token ids so padding and
        # decoding use the ids Evo2 itself expects.
        self.pad_token_id = self.evo2_tokenizer.pad_id
        self.eos_token_id = self.evo2_tokenizer.eos_id

    @property
    def vocab_size(self) -> int:
        """Return the vocab size of the wrapped tokenizer."""
        return self.evo2_tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocab as a token -> id dictionary (chr(i) -> i)."""
        return {chr(i): i for i in range(self.vocab_size)}

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string into a list of single-character tokens."""
        # The wrapped tokenizer emits integer ids; map each back to its
        # character form since HF's slow-tokenizer pipeline works on tokens.
        return [chr(int(token)) for token in self.evo2_tokenizer.tokenize(text)]

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a single-character token to its id (its code point)."""
        return ord(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an id back to its single-character token."""
        return chr(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Concatenate character tokens back into a string."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """No vocabulary to save for Evo2Tokenizer, so just return an empty tuple."""
        return ()

    def __call__(
        self,
        text: Union[str, List[str]],
        text_pair: Optional[Union[str, List[str]]] = None,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = True,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Main tokenization method that handles batching, optional truncation,
        left-padding, and conversion to tensors.

        Args:
            text: A single string or a list of strings to tokenize.
            text_pair: Unused; accepted for interface compatibility.
            padding: False, True (pad to longest in batch), or "max_length"
                (pad to ``max_length``).
            truncation: Whether to truncate sequences longer than ``max_length``.
            max_length: Length bound used by truncation and "max_length" padding.
            return_tensors: "pt" to return torch tensors, otherwise lists.
            return_token_type_ids: Unused; accepted for interface compatibility.
            return_attention_mask: Whether to include the attention mask.

        Returns:
            A BatchEncoding with "input_ids" and optionally "attention_mask".
        """
        if isinstance(text, str):
            text = [text]

        input_ids_list = []
        for seq in text:
            tokens = [int(token) for token in self.evo2_tokenizer.tokenize(seq)]

            if truncation and max_length and len(tokens) > max_length:
                tokens = tokens[:max_length]

            input_ids_list.append(tokens)

        if padding:
            # Honor an explicit "max_length" padding strategy; otherwise pad
            # to the longest sequence in the batch. (The previous code had a
            # dead `if False:` branch here, so max_length padding never fired.)
            if padding == "max_length" and max_length is not None:
                max_len = max_length
            else:
                max_len = max((len(ids) for ids in input_ids_list), default=0)

            padded_input_ids = []
            attention_mask = []

            for ids in input_ids_list:
                # Left-pad: pad ids (mask 0) come first, real tokens (mask 1) last.
                padding_length = max_len - len(ids)
                padded_ids = [self.pad_token_id] * padding_length + ids
                mask = [0] * padding_length + [1] * len(ids)

                padded_input_ids.append(padded_ids)
                attention_mask.append(mask)

            input_ids_list = padded_input_ids
        else:
            attention_mask = [[1] * len(ids) for ids in input_ids_list]

        result = {"input_ids": input_ids_list}
        if return_attention_mask:
            result["attention_mask"] = attention_mask

        if return_tensors == "pt":
            result = {k: torch.tensor(v) for k, v in result.items()}

        return BatchEncoding(
            data=result,
            tensor_type=return_tensors,
            prepend_batch_axis=False,
            encoding=None,
        )

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], torch.Tensor],
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> List[str]:
        """
        Decode a batch of token ids to strings.

        Note: ``skip_special_tokens`` is accepted for interface compatibility
        but delegated decoding does not use it.
        """
        if isinstance(sequences, torch.Tensor):
            sequences = sequences.tolist()

        return self.evo2_tokenizer.detokenize_batch(sequences)

    def decode(
        self,
        token_ids: Union[int, List[int], torch.Tensor],
        skip_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """
        Decode a single sequence of token ids to a string.

        Accepts a flat sequence, or a batch of sequences (in which case only
        the first decoded string is returned).
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        # A flat (non-nested) input decodes directly.
        if not isinstance(token_ids, list) or not token_ids or not isinstance(token_ids[0], (list, torch.Tensor)):
            return self.evo2_tokenizer.detokenize(token_ids)

        # Nested input: treat as a batch and return the first decoded string.
        return self.batch_decode(token_ids, skip_special_tokens, **kwargs)[0]
| |
|
| |
|
| | |
| | |
def register_evo2_tokenizer():
    """Register the Evo2Tokenizer with HuggingFace's AutoTokenizer.

    NOTE(review): recent transformers versions expect the first argument of
    ``AutoTokenizer.register`` to be a config class, not a model-type string —
    confirm this call against the installed transformers version.
    """
    AutoTokenizer.register("evo2", Evo2Tokenizer)
    # Use the module logger instead of print so registration is reported
    # through the normal transformers logging stream.
    logger.info("Evo2Tokenizer registered with AutoTokenizer")
| |
|
| |
|
# Script entry point: register the tokenizer when run directly.
if __name__ == "__main__":
    register_evo2_tokenizer()