# Copyright 2019 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import functools import inspect import itertools import json import os import re import shutil import tempfile import unittest from collections import OrderedDict from itertools import takewhile from pathlib import Path from typing import TYPE_CHECKING, Any, Union from parameterized import parameterized from transformers import ( AutoTokenizer, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase, T5Tokenizer, T5TokenizerFast, TokenizersBackend, is_mlx_available, is_torch_available, logging, ) from transformers.testing_utils import ( get_tests_dir, require_jinja, require_tokenizers, require_torch, slow, ) from transformers.tokenization_python import AddedToken from .test_sentencepiece_backend_mixin import SentencePieceBackendTesterMixin from .test_tokenizers_backend_mixin import TokenizersBackendTesterMixin NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] SMALL_TRAINING_CORPUS = [ ["This is the first sentence.", "This is the second one."], ["This sentence (contains #) over symbols and numbers 12 3.", "But not this one."], ] input_string = """This is a test 😊 I was born in 92000, and this is falsé. 生活的真谛是 Hi Hello Hi Hello Hello hithere The following string should be properly encoded: Hello. But ird and ปี ird ด Hey how are you doing""" # noqa: W293 if is_torch_available(): import torch if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel def use_cache_if_possible(func): @functools.wraps(func) def wrapper(*args, **kwargs): use_cache = kwargs.pop("use_cache", True) underline_func = func if "functools" in str(func): underline_func = func.__wrapped__ if not use_cache: return underline_func(*args, **kwargs) if any(not arg.__hash__ for arg in args): return underline_func(*args, **kwargs) elif any(not kwarg.__hash__ for kwarg in kwargs.values()): return underline_func(*args, **kwargs) cached = func(*args, **kwargs) copied = copy.deepcopy(cached) # Preserve _tokenizer for all tokenizers (Rust tokenizer objects don't deep copy properly) # This was previously only done for CLIP, but it's needed for all TokenizersBackend tokenizers if hasattr(cached, "_tokenizer"): # Restore _tokenizer from original since deep copy may have lost or corrupted it copied._tokenizer = cached._tokenizer if hasattr(copied, "sp_model"): copied.sp_model = cached.sp_model return copied return wrapper logger = logging.get_logger(__name__) NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] def filter_non_english(_, pretrained_name: str): """Filter all the model for non-english language""" return not any(lang in pretrained_name for lang in NON_ENGLISH_TAGS) def filter_roberta_detectors(_, pretrained_name: str): return "detector" not in pretrained_name def merge_model_tokenizer_mappings( model_mapping: dict["PretrainedConfig", "PreTrainedModel"], tokenizer_mapping: dict["PretrainedConfig", tuple["PreTrainedTokenizer", "TokenizersBackend"]], ) -> dict[ Union["PreTrainedTokenizer", "TokenizersBackend"], tuple["PretrainedConfig", "PreTrainedModel"], ]: configurations = list(model_mapping.keys()) model_tokenizer_mapping = OrderedDict([]) for configuration in configurations: if configuration in model_mapping and configuration in tokenizer_mapping: model = model_mapping[configuration] tokenizer = tokenizer_mapping[configuration][0] tokenizer_fast = tokenizer_mapping[configuration][1] if tokenizer is not None: if configuration.__name__.startswith(tokenizer.__name__.replace("Tokenizer", "")): model_tokenizer_mapping.update({tokenizer: (configuration, model)}) if tokenizer_fast is not None: if configuration.__name__.startswith(tokenizer_fast.__name__.replace("TokenizerFast", "")): model_tokenizer_mapping.update({tokenizer_fast: (configuration, model)}) return model_tokenizer_mapping def check_subword_sampling( tokenizer: PreTrainedTokenizer, text: str | None = None, test_sentencepiece_ignore_case: bool = True, ) -> None: """ Check if the tokenizer generates different results when subword regularization is enabled. Subword regularization augments training data with subword sampling. This has a random component. Args: tokenizer: The tokenizer to check. text: The text to use for the checks. test_sentencepiece_ignore_case: See `TokenizerTesterMixin.test_sentencepiece_ignore_case`. """ text = "This is a test for subword regularization." if text is None else text if test_sentencepiece_ignore_case: text = text.lower() tokens_list = [] for _ in range(5): tokens_list.append(tokenizer.tokenize(text)) # the list of different pairs of tokens_list combinations = itertools.combinations(tokens_list, 2) # check of sampling is done subword_sampling_found = False for combination in combinations: if combination[0] != combination[1]: subword_sampling_found = True unittest.TestCase().assertTrue(subword_sampling_found) # check if converting back to original text works for tokens in tokens_list: if test_sentencepiece_ignore_case: unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower()) else: unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens)) class TokenizersExtractor: """ Extractor implementation for tokenizers library tokenizer.json files. This class extracts vocab and merges from a tokenizer.json file, similar to SentencePieceExtractor for .model files. """ def __init__(self, tokenizer_file: str): """ Initialize the extractor with a tokenizer.json file. Args: tokenizer_file (str): Path to the tokenizer.json file """ with open(tokenizer_file, "r", encoding="utf-8") as f: self.tokenizer_data = json.load(f) if "model" not in self.tokenizer_data: raise ValueError(f"Invalid tokenizer.json file: missing 'model' key in {tokenizer_file}") self.model_data = self.tokenizer_data["model"] self.model_type = self.model_data.get("type", "Unknown") def extract(self) -> tuple[dict[str, int], list[tuple[str, float]], list[tuple[str, str]], list[dict]]: """ Extract vocabulary, scores, merges, and added_tokens from the tokenizer.json file. Returns: tuple containing: - vocab_ids (dict[str, int]): Mapping from token string to token ID - vocab_scores (list[tuple[str, float]]): List of (token, score) tuples. Note: tokenizer.json doesn't store scores, so all scores are 0.0 - merges (list[tuple[str, str]]): List of merge pairs for BPE tokenizers - added_tokens (list[dict]): List of added token dicts with 'id', 'content', 'special', etc. Raises: ValueError: If the tokenizer type is not supported or vocab is missing """ # Extract vocabulary if "vocab" not in self.model_data: raise ValueError(f"Tokenizer model type '{self.model_type}' does not have a 'vocab' field") vocab_field = self.model_data["vocab"] # Support both dict-based (BPE/WordPiece/WordLevel) and list-based (Unigram) vocabs if isinstance(vocab_field, dict): # {token: id} vocab_ids = dict(vocab_field) # tokenizer.json doesn't store scores for these types; default to 0.0 and sort by id vocab_scores = sorted([(token, 0.0) for token in vocab_field.keys()], key=lambda x: vocab_field[x[0]]) elif isinstance(vocab_field, list): # [[token, score], ...] — ids are the list indices vocab_ids = {token: idx for idx, (token, _score) in enumerate(vocab_field)} vocab_scores = [(token, float(score)) for token, score in vocab_field] else: raise ValueError(f"Unsupported vocab type in tokenizer.json: {type(vocab_field)}") # Extract merges (for BPE tokenizers) merges = [] if "merges" in self.model_data: # tokenizer.json can store merges as either: # 1. Lists like ["▁", "t"] # 2. Strings like "▁ t" for merge_item in self.model_data["merges"]: if isinstance(merge_item, list): # Already in list format if len(merge_item) == 2: merges.append((merge_item[0], merge_item[1])) else: logger.warning(f"Invalid merge format (expected 2 items): {merge_item}, skipping") elif isinstance(merge_item, str): # String format - split on first space parts = merge_item.split(" ", 1) if len(parts) == 2: merges.append((parts[0], parts[1])) else: logger.warning(f"Invalid merge format: '{merge_item}', skipping") else: logger.warning(f"Unknown merge type: {type(merge_item)}, skipping") # Extract added_tokens from tokenizer.json # These are tokens that should not be split by the tokenization algorithm added_tokens_list = self.tokenizer_data.get("added_tokens", []) # Convert to decoder-style mapping: id -> token dict added_tokens_decoder = {} for item in added_tokens_list: if not isinstance(item, dict) or "id" not in item: continue token_id = item["id"] token_kwargs = {k: v for k, v in item.items() if k != "id"} try: added_token_obj = AddedToken(**token_kwargs) except Exception: # Fallback: at minimum require content content = token_kwargs.get("content") if content is None: continue added_token_obj = AddedToken(content, special=bool(token_kwargs.get("special", True))) added_tokens_decoder[token_id] = added_token_obj return vocab_ids, vocab_scores, merges, added_tokens_decoder class TokenizerTesterMixin: tokenizer_class = None space_between_special_tokens = False from_pretrained_kwargs = None from_pretrained_filter = None from_pretrained_id = None from_pretrained_vocab_key = "vocab_file" test_seq2seq = True test_tokenizer_from_extractor = True # set to True to test a sentencepiece tokenizer test_sentencepiece = False # set to True to ignore casing when testing a sentencepiece tokenizer # test_sentencepiece must also be set to True test_sentencepiece_ignore_case = False # Integration test data - can be optionally set by subclasses # Default comprehensive test string covering various edge cases integration_test_input_string = """This is a test 😊 I was born in 92000, and this is falsé. 生活的真谛是 Hi Hello Hi Hello Hello hithere The following string should be properly encoded: Hello. But ird and ปี ird ด Hey how are you doing""" # noqa: W293 integration_expected_tokens = None integration_expected_token_ids = None @classmethod def setUpClass(cls) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, tokenizer class, vocab key name) if cls.from_pretrained_id is None: cls.from_pretrained_id = [] elif isinstance(cls.from_pretrained_id, str): cls.from_pretrained_id = [cls.from_pretrained_id] cls.tokenizers_list = [] if cls.tokenizer_class is not None: cls.tokenizers_list = [ ( cls.tokenizer_class, pretrained_id, cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {}, ) for pretrained_id in cls.from_pretrained_id ] with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: cls._data = f_data.read().replace("\n\n", "\n").strip() cls.tmpdirname = tempfile.mkdtemp() # save the first pretrained tokenizer to tmpdirname for tests to use if cls.from_pretrained_id and cls.tokenizer_class is not None: try: tokenizer = AutoTokenizer.from_pretrained( cls.from_pretrained_id[0], **(cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {}), ) tokenizer.save_pretrained(cls.tmpdirname) except Exception: pass @classmethod def tearDownClass(cls): shutil.rmtree(cls.tmpdirname, ignore_errors=True) def get_input_output_texts(self, tokenizer): input_txt = self.get_clean_sequence(tokenizer)[0] return input_txt, input_txt def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> tuple[str, list]: # the length of the tokenizer does not always represent the tokens that it can encode: what if there are holes? toks = [ (i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in set(tokenizer.get_vocab().values()) ] toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) if max_length is not None and len(toks) > max_length: toks = toks[:max_length] if min_length is not None and len(toks) < min_length and len(toks) > 0: while len(toks) < min_length: toks = toks + toks # toks_str = [t[1] for t in toks] toks_ids = [t[0] for t in toks] # Ensure consistency output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) if " " not in output_txt and len(toks_ids) > 1: output_txt = ( tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) + " " + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) ) if with_prefix_space: output_txt = " " + output_txt output_ids = tokenizer.encode(output_txt, add_special_tokens=False) return output_txt, output_ids def get_tokenizers(self, **kwargs) -> list[PreTrainedTokenizerBase]: """ Returns a list containing a single tokenizer from get_tokenizer(). Subclasses can override this method to return multiple tokenizers for testing. """ return [self.get_tokenizer(**kwargs)] @classmethod def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer: """Get a tokenizer instance from pretrained.""" pretrained_name = pretrained_name or cls.tmpdirname return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_extracted_tokenizer(self, reference_tokenizer=None): """ Build a tokenizer from extracted vocab/merges using TokenizersExtractor. Args: reference_tokenizer: Optional tokenizer to copy special tokens from. If None, uses get_tokenizer(). Returns: Tokenizer built from extracted vocab/merges, or None if extraction fails. """ if reference_tokenizer is None: reference_tokenizer = self.get_tokenizer() tokenizer_json_path = os.path.join(self.tmpdirname, "tokenizer.json") if not os.path.exists(tokenizer_json_path): return None extractor = TokenizersExtractor(tokenizer_json_path) vocab_ids, vocab_scores, merges, added_tokens_decoder = extractor.extract() vocab = vocab_scores if _type := getattr(self.tokenizer_class, "model", None): if _type.__name__ == "BPE" or _type.__name__ == "WordPiece": vocab = vocab_ids # Convert added_tokens list to added_tokens_decoder dict format # This matches the format used by from_pretrained() from tokenizer_config.jso tokenizer_from_extractor = self.tokenizer_class( vocab=vocab, merges=merges, do_lower_case=False, keep_accents=True, added_tokens_decoder=added_tokens_decoder, **(self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}), ) return tokenizer_from_extractor def get_extracted_tokenizer_from_sentencepiece(self, reference_tokenizer=None): """ Build a tokenizer from extracted vocab/merges using SentencePieceExtractor. """ from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor try: sentencepiece_model_path = os.path.join(self.tmpdirname, "tokenizer.model") if not os.path.exists(sentencepiece_model_path): return None extractor = SentencePieceExtractor(sentencepiece_model_path) vocab_ids, vocab_scores, merges = extractor.extract() tokenizer_from_extractor = self.tokenizer_class(vocab=vocab_ids, merges=merges) return tokenizer_from_extractor except (TypeError, Exception): return None def tokenizer_integration_test_util( self, expected_encoding: dict, model_name: str, revision: str | None = None, sequences: list[str] | None = None, decode_kwargs: dict[str, Any] | None = None, padding: bool = True, ): """ Util for integration test. Text is tokenized and then reverted back to text. Both results are then checked. Args: expected_encoding: The expected result of the tokenizer output. model_name: The model name of the tokenizer to load and use. revision: The full git revision number of the model. This is to pin the tokenizer config and to avoid that tests start to fail if the config gets changed upstream. sequences: Can overwrite the texts that are used to check the tokenizer. This is useful if the tokenizer supports non english languages like france. decode_kwargs: Additional args for the ``decode`` function which reverts the tokenized text back to a string. padding: Activates and controls padding of the tokenizer. """ decode_kwargs = {} if decode_kwargs is None else decode_kwargs if sequences is None: sequences = [ "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural " "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained " "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.", "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " "conditioning on both left and right context in all layers.", "The quick brown fox jumps over the lazy dog.", ] if self.test_sentencepiece_ignore_case: sequences = [sequence.lower() for sequence in sequences] tokenizer_classes = [self.tokenizer_class] for tokenizer_class in tokenizer_classes: tokenizer = tokenizer_class.from_pretrained( model_name, revision=revision, # to pin the tokenizer version ) encoding = tokenizer(sequences, padding=padding) decoded_sequences = [ tokenizer.decode(seq, skip_special_tokens=True, **decode_kwargs) for seq in encoding["input_ids"] ] encoding_data = encoding.data self.assertDictEqual(encoding_data, expected_encoding) for expected, decoded in zip(sequences, decoded_sequences): if self.test_sentencepiece_ignore_case: expected = expected.lower() self.assertEqual(expected, decoded) def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int): # Ensure we match max_length self.assertEqual(len(input_r), max_length) self.assertEqual(len(input_p), max_length) # Ensure the number of padded tokens is the same padded_tokens_r = list(takewhile(lambda i: i == pad_token_id, reversed(input_r))) padded_tokens_p = list(takewhile(lambda i: i == pad_token_id, reversed(input_p))) self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) def assert_batch_padded_input_match( self, input_r: dict, input_p: dict, max_length: int, pad_token_id: int, model_main_input_name: str = "input_ids", ): for i_r in input_r.values(): ( self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(len(i_r[1]), max_length), ) ( self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(len(i_r[1]), max_length), ) for i_r, i_p in zip(input_r[model_main_input_name], input_p[model_main_input_name]): self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id) for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): self.assertSequenceEqual(i_r, i_p) @staticmethod def convert_batch_to_list_format(batch_encode_plus_sequences): # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...} # to the list of examples/ encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}] return [ {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences} for i in range(len(batch_encode_plus_sequences["input_ids"])) ] # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. def test_tokenize_special_tokens(self): """Test `tokenize` with special tokens.""" tokenizer = self.get_tokenizer(do_lower_case=True) SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]" SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]" # Both methods should add the token to `_extra_special_tokens` and `added_tokens_decoder` tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) tokenizer.add_special_tokens({"extra_special_tokens": [SPECIAL_TOKEN_2]}, replace_extra_special_tokens=False) token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) self.assertEqual(len(token_1), 1) self.assertEqual(len(token_2), 1) self.assertEqual(token_1[0], SPECIAL_TOKEN_1) # next is failing for almost all the Fast tokenizers now. # self.assertEqual(token_2[0], SPECIAL_TOKEN_2) def test_model_input_names_signature(self): accepted_model_main_input_names = [ "input_ids", # nlp models "input_values", # speech models ] tokenizer = self.get_tokenizer() # first name of model_input_names has to correspond to main model input name # to make sure `tokenizer.pad(...)` works correctly self.assertTrue(tokenizer.model_input_names[0] in accepted_model_main_input_names) def test_tokenizer_store_full_signature(self): signature = inspect.signature(self.tokenizer_class.__init__) tokenizer = self.get_tokenizer() for parameter_name, parameter in signature.parameters.items(): if parameter.default != inspect.Parameter.empty and parameter_name not in [ "vocab_file", "merges_file", "tokenizer_file", "vocab", "merges", "legacy", ]: self.assertIn(parameter_name, tokenizer.init_kwargs) def test_tokenizers_common_properties(self): tokenizer = self.get_tokenizer() attributes_list = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", ] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) self.assertTrue(hasattr(tokenizer, attr + "_id")) self.assertTrue(hasattr(tokenizer, "extra_special_tokens")) self.assertTrue(hasattr(tokenizer, "extra_special_tokens_ids")) attributes_list = [ "model_max_length", "init_inputs", "init_kwargs", ] if not isinstance(tokenizer, TokenizersBackend): attributes_list += [ "added_tokens_encoder", "added_tokens_decoder", ] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) def test_tokenizers_common_ids_setters(self): tokenizer = self.get_tokenizer() attributes_list = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", ] vocab = tokenizer.get_vocab() token_id_to_test_setters = next(iter(vocab.values())) token_to_test_setters = tokenizer.convert_ids_to_tokens(token_id_to_test_setters, skip_special_tokens=False) for attr in attributes_list: setattr(tokenizer, attr + "_id", None) self.assertEqual(getattr(tokenizer, attr), None) self.assertEqual(getattr(tokenizer, attr + "_id"), None) setattr(tokenizer, attr + "_id", token_id_to_test_setters) self.assertEqual(getattr(tokenizer, attr), token_to_test_setters) self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters) setattr(tokenizer, "extra_special_tokens_ids", []) self.assertListEqual(getattr(tokenizer, "extra_special_tokens"), []) self.assertListEqual(getattr(tokenizer, "extra_special_tokens_ids"), []) setattr(tokenizer, "extra_special_tokens_ids", [token_id_to_test_setters]) self.assertListEqual(getattr(tokenizer, "extra_special_tokens"), [token_to_test_setters]) self.assertListEqual(getattr(tokenizer, "extra_special_tokens_ids"), [token_id_to_test_setters]) def test_save_and_load_tokenizer(self): # safety check on max_len default value so we are sure the test works tokenizer = self.get_tokenizer() self.assertNotEqual(tokenizer.model_max_length, 42) # Now let's start the test tokenizer = self.get_tokenizer() # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) after_vocab = after_tokenizer.get_vocab() self.assertListEqual(before_tokens, after_tokens) self.assertDictEqual(before_vocab, after_vocab) shutil.rmtree(tmpdirname) tokenizer = self.get_tokenizer(model_max_length=42) # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) extra_special_tokens = tokenizer.extra_special_tokens extra_special_tokens.append("new_extra_special_token") tokenizer.add_special_tokens( {"extra_special_tokens": extra_special_tokens}, replace_extra_special_tokens=False ) before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) after_vocab = after_tokenizer.get_vocab() self.assertListEqual(before_tokens, after_tokens) self.assertDictEqual(before_vocab, after_vocab) self.assertIn("bim", after_vocab) self.assertIn("bambam", after_vocab) self.assertIn("new_extra_special_token", after_tokenizer.extra_special_tokens) self.assertEqual(after_tokenizer.model_max_length, 42) tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) self.assertEqual(tokenizer.model_max_length, 43) shutil.rmtree(tmpdirname) # Test that we can also use the non-legacy saving format for fast tokenizers tokenizer = self.get_tokenizer(model_max_length=42) # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) extra_special_tokens = tokenizer.extra_special_tokens extra_special_tokens.append("new_extra_special_token") tokenizer.add_special_tokens( {"extra_special_tokens": extra_special_tokens}, replace_extra_special_tokens=False ) before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) after_vocab = after_tokenizer.get_vocab() self.assertListEqual(before_tokens, after_tokens) self.assertDictEqual(before_vocab, after_vocab) self.assertIn("bim", after_vocab) self.assertIn("bambam", after_vocab) self.assertIn("new_extra_special_token", after_tokenizer.extra_special_tokens) self.assertEqual(after_tokenizer.model_max_length, 42) tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) self.assertEqual(tokenizer.model_max_length, 43) shutil.rmtree(tmpdirname) def _run_integration_checks(self, tokenizer, tokenizer_type): # Test 1: Tokens match expected tokens = tokenizer.tokenize(self.integration_test_input_string) self.maxDiff = None self.assertListEqual( tokens, self.integration_expected_tokens, f"Tokenized tokens don't match expected for {tokenizer.__class__.__name__} ({tokenizer_type})", ) # Test 2: IDs from encode match expected (without special tokens) ids_from_encode = tokenizer.encode(self.integration_test_input_string, add_special_tokens=False) self.assertEqual( ids_from_encode, self.integration_expected_token_ids, f"Encoded IDs don't match expected for {tokenizer.__class__.__name__} ({tokenizer_type})", ) # Test 3: Round-trip decode produces expected text (if provided) decoded_text = tokenizer.decode(self.integration_expected_token_ids, clean_up_tokenization_spaces=False) self.assertEqual( decoded_text, self.integration_expected_decoded_text, f"Decoded text doesn't match expected for {tokenizer.__class__.__name__} ({tokenizer_type})", ) def test_integration(self): """ Integration checks for the original tokenizer only. """ # Skip if no integration test data is provided if not hasattr(self, "integration_test_input_string") or self.integration_test_input_string is None: self.skipTest("No integration test input string provided") if not hasattr(self, "integration_expected_tokens") or self.integration_expected_tokens is None: self.skipTest("No integration expected tokens provided") if not hasattr(self, "integration_expected_token_ids") or self.integration_expected_token_ids is None: self.skipTest("No integration expected token IDs provided") if not hasattr(self, "integration_expected_decoded_text") or self.integration_expected_decoded_text is None: self.skipTest("No integration expected decoded text provided") tokenizer_original = self.tokenizer_class.from_pretrained( self.from_pretrained_id[0], do_lower_case=False, keep_accents=True, **(self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}), ) self._run_integration_checks(tokenizer_original, "original") def test_integration_from_extractor(self): """ Integration checks for a tokenizer built via TokenizersExtractor. """ # Skip if tokenizer-from-extractor path is not enabled for this class if not getattr(self, "test_tokenizer_from_extractor", False): self.skipTest("Tokenizer from TokenizersExtractor not enabled for this tokenizer") # Skip if no integration test data is provided if not hasattr(self, "integration_test_input_string") or self.integration_test_input_string is None: self.skipTest("No integration test input string provided") if not hasattr(self, "integration_expected_tokens") or self.integration_expected_tokens is None: self.skipTest("No integration expected tokens provided") if not hasattr(self, "integration_expected_token_ids") or self.integration_expected_token_ids is None: self.skipTest("No integration expected token IDs provided") if not hasattr(self, "integration_expected_decoded_text") or self.integration_expected_decoded_text is None: self.skipTest("No integration expected decoded text provided") tokenizer_original = self.tokenizer_class.from_pretrained( self.from_pretrained_id[0], do_lower_case=False, keep_accents=True, **(self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}), ) tokenizer_from_extractor = self.get_extracted_tokenizer(reference_tokenizer=tokenizer_original) if tokenizer_from_extractor is None: self.fail("No tokenizer from TokenizersExtractor provided") self._run_integration_checks(tokenizer_from_extractor, "from_extractor") def test_internal_consistency(self): tokenizer = self.get_tokenizer() input_text, output_text = self.get_input_output_texts(tokenizer) tokens = tokenizer.tokenize(input_text) ids = tokenizer.convert_tokens_to_ids(tokens) ids_2 = tokenizer.encode(input_text, add_special_tokens=False) self.assertListEqual(ids, ids_2) tokens_2 = tokenizer.convert_ids_to_tokens(ids) self.assertNotEqual(len(tokens_2), 0) text_2 = tokenizer.decode(ids) self.assertIsInstance(text_2, str) self.assertEqual(text_2, output_text) def test_mask_output(self): tokenizer = self.get_tokenizer(do_lower_case=False) seq_0 = "Test this method." seq_1 = "With these inputs." information = tokenizer(seq_0, seq_1, add_special_tokens=True, return_token_type_ids=True) sequences, mask = information["input_ids"], information["token_type_ids"] self.assertEqual(len(sequences), len(mask)) def test_token_type_ids(self): tokenizer = self.get_tokenizer() seq_0 = "Test this method." # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids # (regardless of whether the model use token type ids) # We use this assumption in the QA pipeline among other place output = tokenizer(seq_0, return_token_type_ids=True) self.assertIn(0, output["token_type_ids"]) def test_sequence_ids(self): tokenizer = self.get_tokenizer() if tokenizer.backend != "tokenizers": self.skipTest(reason="Tokenizers backend tokenizer") seq_0 = "Test this method." seq_1 = "With these inputs." # We want to have sequence 0 and sequence 1 are tagged # respectively with 0 and 1 token_ids\ # (regardless of whether the model use token type ids) # We use this assumption in the QA pipeline among other place output = tokenizer(seq_0) self.assertIn(0, output.sequence_ids()) output = tokenizer(seq_0, seq_1) self.assertIn(0, output.sequence_ids()) self.assertIn(1, output.sequence_ids()) if tokenizer.num_special_tokens_to_add(pair=True): self.assertIn(None, output.sequence_ids()) @require_jinja def test_chat_template(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" dummy_conversation = [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, {"role": "assistant", "content": "assistant message"}, ] expected_output = "systemsystem messageuseruser messageassistantassistant message" tokenizer = self.get_tokenizer() output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, return_dict=False ) self.assertEqual(output, expected_output) # Test we can pass chat_template arg # Check that no error raised when tokenize=True output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=True, return_dict=False ) dict_output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=True, # This also checks return_dict=True is the default ) self.assertEqual(dict_output["input_ids"], output) # Test return_dict behaviour matches tokenizer.chat_template = dummy_template self.assertEqual(tokenizer.chat_template, dummy_template) # Test property setter output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) self.assertEqual(output, expected_output) # Test chat_template attribute is used if no arg is passed # Check that no error raised tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) with tempfile.TemporaryDirectory() as tmp_dir_name: save_files = tokenizer.save_pretrained(tmp_dir_name, save_jinja_files=False) # Check we aren't saving a chat_template.jinja file self.assertFalse(any(file.endswith("chat_template.jinja") for file in save_files)) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) self.assertEqual(new_tokenizer.chat_template, dummy_template) # Test template has persisted output = new_tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) self.assertEqual(output, expected_output) # Test output is the same after reloading # Check that no error raised new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) with tempfile.TemporaryDirectory() as tmp_dir_name: save_files = tokenizer.save_pretrained(tmp_dir_name) # Check we are saving a chat_template.jinja file self.assertTrue(any(file.endswith("chat_template.jinja") for file in save_files)) chat_template_file = Path(tmp_dir_name) / "chat_template.jinja" self.assertTrue(chat_template_file.is_file()) self.assertEqual(chat_template_file.read_text(), dummy_template) config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text()) # Assert the chat template is not in the config when it's saved as a separate file self.assertNotIn("chat_template", config_dict) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) self.assertEqual(new_tokenizer.chat_template, dummy_template) # Test template has persisted output = new_tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) self.assertEqual(output, expected_output) # Test output is the same after reloading # Check that no error raised new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) @require_jinja def test_chat_template_save_loading(self): tokenizer = self.get_tokenizer() signature = inspect.signature(tokenizer.__init__) if "chat_template" not in {*signature.parameters.keys()}: self.skipTest("tokenizer doesn't accept chat templates at input") tokenizer.chat_template = "test template" with tempfile.TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) self.assertFalse(Path(tmpdirname, "additional_chat_templates").is_dir()) reloaded_tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) self.assertEqual(tokenizer.chat_template, reloaded_tokenizer.chat_template) # When we save as single files, tokenizers and tokenizers share a chat template, which means # the reloaded tokenizer should get the chat template as well self.assertEqual(reloaded_tokenizer.chat_template, reloaded_tokenizer.tokenizer.chat_template) with tempfile.TemporaryDirectory() as tmpdirname: tokenizer.chat_template = {"default": "a", "secondary": "b"} tokenizer.save_pretrained(tmpdirname) self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) self.assertTrue(Path(tmpdirname, "additional_chat_templates").is_dir()) reloaded_tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) self.assertEqual(tokenizer.chat_template, reloaded_tokenizer.chat_template) # When we save as single files, tokenizers and tokenizers share a chat template, which means # the reloaded tokenizer should get the chat template as well self.assertEqual(reloaded_tokenizer.chat_template, reloaded_tokenizer.tokenizer.chat_template) with tempfile.TemporaryDirectory() as tmpdirname: tokenizer.chat_template = {"default": "a", "secondary": "b"} tokenizer.save_pretrained(tmpdirname, save_jinja_files=False) self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) self.assertFalse(Path(tmpdirname, "additional_chat_templates").is_dir()) reloaded_tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) self.assertEqual(tokenizer.chat_template, reloaded_tokenizer.chat_template) # When we save as single files, tokenizers and tokenizers share a chat template, which means # the reloaded tokenizer should get the chat template as well self.assertEqual(reloaded_tokenizer.chat_template, reloaded_tokenizer.tokenizer.chat_template) @require_jinja def test_chat_template_batched(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" dummy_conversations = [ [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, {"role": "assistant", "content": "assistant message"}, ], [ {"role": "system", "content": "system message 2"}, {"role": "user", "content": "user message 2"}, {"role": "assistant", "content": "assistant message 2"}, ], ] tokenizer = self.get_tokenizer() output = tokenizer.apply_chat_template(dummy_conversations, chat_template=dummy_template, tokenize=False) self.assertEqual( output, [ "systemsystem messageuseruser messageassistantassistant message", "systemsystem message 2useruser message 2assistantassistant message 2", ], ) one_element_output = tokenizer.apply_chat_template( dummy_conversations[:1], chat_template=dummy_template, tokenize=False ) self.assertEqual( one_element_output, ["systemsystem messageuseruser messageassistantassistant message"] ) # Assert that list structure is retained even with one element tokenizer.apply_chat_template( dummy_conversations, chat_template=dummy_template, tokenize=True ) # Check that no error raised @require_jinja def test_jinja_loopcontrols(self): break_template = """ {%- for message in messages %} {{- message.role + " " + message.content }} {%- if loop.first %} {%- break %} {%- endif %} {%- endfor %}""".strip() dummy_conversation = [ {"role": "system", "content": "1"}, {"role": "user", "content": "2"}, {"role": "assistant", "content": "3"}, ] tokenizer = self.get_tokenizer() break_output = tokenizer.apply_chat_template(dummy_conversation, chat_template=break_template, tokenize=False) self.assertEqual(break_output, "system 1") # Loop should break after first iter @require_jinja def test_jinja_strftime(self): strftime_template = """{{- strftime_now("%Y-%m-%d") }}""".strip() dummy_conversation = [ {"role": "system", "content": "1"}, {"role": "user", "content": "2"}, {"role": "assistant", "content": "3"}, ] tokenizer = self.get_tokenizer() strftime_output = tokenizer.apply_chat_template( dummy_conversation, chat_template=strftime_template, tokenize=False ) # Assert that we get a date formatted as expected self.assertEqual(len(strftime_output), 10) self.assertEqual(len(strftime_output.split("-")), 3) @require_torch @require_jinja def test_chat_template_return_assistant_tokens_mask(self): dummy_template = ( "{% for message in messages %}" "{% if (message['role'] != 'assistant') %}" "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" "{% elif (message['role'] == 'assistant')%}" "{{'<|im_start|>' + message['role'] + '\n'}}" "{% generation %}" "{{message['content'] + '<|im_end|>'}}" "{% endgeneration %}" "{{'\n'}}" "{% endif %}" "{% endfor %}" ) conversations = [ [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, {"role": "assistant", "content": "start turn 1 assistant message. end turn 1"}, {"role": "user", "content": "user message 2"}, {"role": "assistant", "content": "start turn 2 assistant message. end turn 2"}, ], [ {"role": "system", "content": "system message 3"}, {"role": "user", "content": "user message 3"}, {"role": "assistant", "content": "start turn 3 assistant message. end turn 3"}, {"role": "user", "content": "user message 4"}, {"role": "assistant", "content": "start turn 4 assistant message. end turn 4"}, ], ] # These are the prefix and suffix strings of all the assistant messages. Used to find the assistant substring # in the entire chat string, and then find the corresponding tokens in the tokenized output. assistant_prefix_suffix = [ [("start turn 1", "end turn 1<|im_end|>"), ("start turn 2", "end turn 2<|im_end|>")], [("start turn 3", "end turn 3<|im_end|>"), ("start turn 4", "end turn 4<|im_end|>")], ] for tokenizer, pretrained_name, _ in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.get_tokenizer(pretrained_name) if tokenizer_r.backend != "tokenizers": self.skipTest(reason="Custom backend tokenizer") self._check_no_pad_token_padding(tokenizer_r, conversations) tokenizer_r.padding_side = "right" # check batched output = tokenizer_r.apply_chat_template( conversations, chat_template=dummy_template, tokenize=True, return_assistant_tokens_mask=True, return_dict=True, ) output_pt = tokenizer_r.apply_chat_template( conversations, chat_template=dummy_template, tokenize=True, padding=True, return_assistant_tokens_mask=True, return_dict=True, return_tensors="pt", ) self.assertEqual(type(output_pt["assistant_masks"]), torch.Tensor) self.assertEqual(output_pt["assistant_masks"].shape, output_pt["input_ids"].shape) for i, conv in enumerate(conversations): chat_string = tokenizer_r.apply_chat_template(conv, tokenize=False, chat_template=dummy_template) assistant_start = output.char_to_token(i, chat_string.index(assistant_prefix_suffix[i][0][0])) assistant_end = output.char_to_token( i, chat_string.index(assistant_prefix_suffix[i][0][1]) + len(assistant_prefix_suffix[i][0][1]) - 1, ) assistant_start2 = output.char_to_token(i, chat_string.index(assistant_prefix_suffix[i][1][0])) assistant_end2 = output.char_to_token( i, chat_string.index(assistant_prefix_suffix[i][1][1]) + len(assistant_prefix_suffix[i][1][1]) - 1, ) if ( assistant_start is None or assistant_end is None or assistant_start2 is None or assistant_end2 is None ): continue # assert 1 in first assistant message self.assertEqual( output["assistant_masks"][i][assistant_start : assistant_end + 1], [1] * (assistant_end - assistant_start + 1), ) self.assertTrue( (output_pt["assistant_masks"][i, assistant_start : assistant_end + 1] == 1).all(), ) # assert 1 second assistant message self.assertEqual( output["assistant_masks"][i][assistant_start2 : assistant_end2 + 1], [1] * (assistant_end2 - assistant_start2 + 1), ) self.assertTrue( (output_pt["assistant_masks"][i, assistant_start2 : assistant_end2 + 1] == 1).all(), ) # assert 0 in user/system indices self.assertEqual(output["assistant_masks"][i][:assistant_start], [0] * assistant_start) self.assertTrue((output_pt["assistant_masks"][i, :assistant_start] == 0).all()) self.assertEqual( output["assistant_masks"][i][assistant_end + 1 : assistant_start2], [0] * (assistant_start2 - assistant_end - 1), ) self.assertTrue( (output_pt["assistant_masks"][i, assistant_end + 1 : assistant_start2] == 0).all(), ) # check not batched output = tokenizer_r.apply_chat_template( conversations[0], chat_template=dummy_template, tokenize=True, return_assistant_tokens_mask=True, return_dict=True, ) output_pt = tokenizer_r.apply_chat_template( conversations[0], chat_template=dummy_template, tokenize=True, return_assistant_tokens_mask=True, return_dict=True, return_tensors="pt", ) self.assertEqual(type(output_pt["assistant_masks"]), torch.Tensor) self.assertEqual(output_pt["assistant_masks"].shape, output_pt["input_ids"].shape) chat_string = tokenizer_r.apply_chat_template( conversations[0], tokenize=False, chat_template=dummy_template ) assistant_start = output.char_to_token(0, chat_string.index(assistant_prefix_suffix[0][0][0])) assistant_end = output.char_to_token( 0, chat_string.index(assistant_prefix_suffix[0][0][1]) + len(assistant_prefix_suffix[0][0][1]) - 1 ) assistant_start2 = output.char_to_token(0, chat_string.index(assistant_prefix_suffix[0][1][0])) assistant_end2 = output.char_to_token( 0, chat_string.index(assistant_prefix_suffix[0][1][1]) + len(assistant_prefix_suffix[0][1][1]) - 1 ) if ( assistant_start is None or assistant_end is None or assistant_start2 is None or assistant_end2 is None ): return # assert 1 in assistant indices self.assertEqual( output["assistant_masks"][assistant_start : assistant_end + 1], [1] * (assistant_end - assistant_start + 1), ) self.assertTrue( (output_pt["assistant_masks"][assistant_start : assistant_end + 1] == 1).all(), ) self.assertEqual( output["assistant_masks"][assistant_start2 : assistant_end2 + 1], [1] * (assistant_end2 - assistant_start2 + 1), ) self.assertTrue( (output_pt["assistant_masks"][assistant_start2 : assistant_end2 + 1] == 1).all(), ) # assert 0 in user/system indices self.assertEqual(output["assistant_masks"][:assistant_start], [0] * assistant_start) self.assertTrue((output_pt["assistant_masks"][0, :assistant_start] == 0).all()) self.assertEqual( output["assistant_masks"][assistant_end + 1 : assistant_start2], [0] * (assistant_start2 - assistant_end - 1), ) self.assertTrue( (output_pt["assistant_masks"][0, assistant_end + 1 : assistant_start2] == 0).all(), ) @require_jinja def test_chat_template_return_assistant_tokens_mask_truncated(self): dummy_template = ( "{% for message in messages %}" "{% if (message['role'] != 'assistant') %}" "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" "{% elif (message['role'] == 'assistant')%}" "{{'<|im_start|>' + message['role'] + '\n'}}" "{% generation %}" "{{message['content'] + '<|im_end|>'}}" "{% endgeneration %}" "{{'\n'}}" "{% endif %}" "{% endfor %}" ) conversations = [ [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, { "role": "assistant", "content": ( "start turn assistant. long string to be truncated, long string to be truncated, " "long string to be truncated, long string to be truncated, long string to be truncated" ), }, {"role": "user", "content": "another user message"}, ], [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, { "role": "assistant", "content": ( "start turn assistant. long string to be truncated, long string to be truncated, " "long string to be truncated, long string to be truncated, long string to be truncated" ), }, {"role": "user", "content": "another user message"}, ], ] for tokenizer, pretrained_name, _ in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.get_tokenizer(pretrained_name) if tokenizer_r.backend != "tokenizers": self.skipTest(reason="Custom backend tokenizer") # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the # truncation to happen in the middle of the assistant content. full_encoding = tokenizer_r.apply_chat_template( conversations[0], chat_template=dummy_template, tokenize=True, return_dict=True, ) chat_string = tokenizer_r.apply_chat_template( conversations[0], tokenize=False, chat_template=dummy_template ) truncation_position = full_encoding.char_to_token(chat_string.index(", long string to be truncated,")) if truncation_position is None: self.skipTest("char_to_token returned None, cannot determine truncation position") # check batched output = tokenizer_r.apply_chat_template( conversations, chat_template=dummy_template, tokenize=True, return_assistant_tokens_mask=True, max_length=truncation_position, truncation=True, return_dict=True, ) for i, conv in enumerate(conversations): chat_string = tokenizer_r.apply_chat_template(conv, tokenize=False, chat_template=dummy_template) assistant_start = output.char_to_token(i, chat_string.index("start turn assistant")) if assistant_start is None: continue # assert 1 from assistant_start to the end because the rest is truncated. self.assertEqual( output["assistant_masks"][i][assistant_start:], [1] * (len(output["assistant_masks"][i]) - assistant_start), ) # check not batched output = tokenizer_r.apply_chat_template( conversations[0], chat_template=dummy_template, tokenize=True, return_assistant_tokens_mask=True, return_dict=True, max_length=truncation_position, truncation=True, ) chat_string = tokenizer_r.apply_chat_template( conversations[0], tokenize=False, chat_template=dummy_template ) assistant_start = output.char_to_token(0, chat_string.index("start turn assistant")) if assistant_start is None: return # assert 1 from assistant_start to the end because the rest is truncated. self.assertEqual( output["assistant_masks"][assistant_start:], [1] * (len(output["assistant_masks"]) - assistant_start), ) @require_jinja def test_continue_final_message(self): dummy_template = """ {%- for message in messages %} {{- "<|im_start|>" + message['role'] + "\n" + message['content'] + "<|im_end|>" + "\n"}} {%- endfor %}""" dummy_conversation = [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, {"role": "assistant", "content": "assistant message"}, ] tokenizer = self.get_tokenizer() output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=False ) self.assertEqual( output, "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message<|im_end|>\n", ) prefill_output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=True ) # Assert that the final message is unterminated self.assertEqual( prefill_output, "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message", ) @require_jinja def test_continue_final_message_with_trim(self): """Regression test for chat templates with trimming: https://github.com/huggingface/transformers/pull/34214""" dummy_template = """ {%- for message in messages %} {{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>" + "\n"}} {%- endfor %}""" dummy_conversation = [ {"role": "system", "content": "system message"}, {"role": "user", "content": "user message"}, {"role": "assistant", "content": "assistant message "}, # Note the trailing whitespace ] tokenizer = self.get_tokenizer() output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=False ) self.assertEqual( output, "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message<|im_end|>\n", ) prefill_output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=True ) # Assert that the final message is unterminated self.assertEqual( prefill_output, "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message", ) @require_jinja def test_continue_final_message_with_decoy_earlier_message(self): """Regression test for chat templates where an earlier message has similar content to the final message https://github.com/huggingface/transformers/issues/35433""" dummy_template = """ {%- for message in messages %} {{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>" + "\n"}} {%- endfor %}""" dummy_conversation = [ {"role": "user", "content": "hi 0"}, {"role": "assistant", "content": "bye: 0"}, {"role": "user", "content": "hi 1"}, {"role": "assistant", "content": "bye: "}, ] tokenizer = self.get_tokenizer() prefill_output = tokenizer.apply_chat_template( dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=True ) # Assert that the final message is unterminated self.assertEqual( prefill_output, "<|im_start|>user\nhi 0<|im_end|>\n<|im_start|>assistant\nbye: 0<|im_end|>\n<|im_start|>user\nhi 1<|im_end|>\n<|im_start|>assistant\nbye:", ) @require_jinja def test_chat_template_dict(self): dummy_template_1 = "{{'a'}}" dummy_template_2 = "{{'b'}}" dummy_conversation = [ {"role": "user", "content": "user message"}, ] tokenizer = self.get_tokenizer() tokenizer.chat_template = {"template1": dummy_template_1, "template2": dummy_template_2} output1 = tokenizer.apply_chat_template(dummy_conversation, chat_template=dummy_template_1, tokenize=False) output1_via_dict = tokenizer.apply_chat_template(dummy_conversation, chat_template="template1", tokenize=False) self.assertEqual(output1, output1_via_dict) output2 = tokenizer.apply_chat_template(dummy_conversation, chat_template=dummy_template_2, tokenize=False) output2_via_dict = tokenizer.apply_chat_template(dummy_conversation, chat_template="template2", tokenize=False) self.assertEqual(output2, output2_via_dict) @require_jinja def test_chat_template_dict_saving(self): dummy_template_1 = "{{'a'}}" dummy_template_2 = "{{'b'}}" tokenizer = self.get_tokenizer() for save_jinja_files in (True, False): tokenizer.chat_template = {"default": dummy_template_1, "template2": dummy_template_2} with tempfile.TemporaryDirectory() as tmp_dir_name: # Test that save_jinja_files is ignored when there's a dict of multiple templates tokenizer.save_pretrained(tmp_dir_name, save_jinja_files=save_jinja_files) if save_jinja_files: config_dict = json.load(open(os.path.join(tmp_dir_name, "tokenizer_config.json"))) self.assertNotIn("chat_template", config_dict) self.assertTrue(os.path.exists(os.path.join(tmp_dir_name, "chat_template.jinja"))) self.assertTrue( os.path.exists(os.path.join(tmp_dir_name, "additional_chat_templates/template2.jinja")) ) else: config_dict = json.load(open(os.path.join(tmp_dir_name, "tokenizer_config.json"))) # Assert that chat templates are correctly serialized as lists of dictionaries self.assertEqual( config_dict["chat_template"], [ {"name": "default", "template": "{{'a'}}"}, {"name": "template2", "template": "{{'b'}}"}, ], ) self.assertFalse(os.path.exists(os.path.join(tmp_dir_name, "chat_template.jinja"))) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) # Assert that the serialized list is correctly reconstructed as a single dict self.assertEqual(new_tokenizer.chat_template, tokenizer.chat_template) @require_jinja def test_chat_template_file_priority(self): dummy_template1 = "a" dummy_template2 = "b" tokenizer = self.get_tokenizer() with tempfile.TemporaryDirectory() as tmp_dir_name: tokenizer.chat_template = dummy_template1 tokenizer.save_pretrained(tmp_dir_name, save_jinja_files=False) with Path(tmp_dir_name, "chat_template.jinja").open("w") as f: f.write(dummy_template2) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) # Assert the file template clobbers any template in the config self.assertEqual(new_tokenizer.chat_template, dummy_template2) def test_number_of_added_tokens(self): tokenizer = self.get_tokenizer(do_lower_case=False) seq_0 = "Test this method." seq_1 = "With these inputs." sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) # Method is implemented (e.g. not GPT-2) if len(attached_sequences) != 2: self.assertEqual(tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)) def test_maximum_encoding_length_single_input(self): tokenizer = self.get_tokenizer(do_lower_case=False, model_max_length=100) seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) sequence = tokenizer.encode(seq_0, add_special_tokens=False) total_length = len(sequence) self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it, it's too short") # Test with max model input length model_max_length = tokenizer.model_max_length self.assertEqual(model_max_length, 100) seq_1 = seq_0 * model_max_length sequence1 = tokenizer(seq_1, add_special_tokens=False) total_length1 = len(sequence1["input_ids"]) self.assertGreater( total_length1, model_max_length, "Issue with the testing sequence, please update it, it's too short", ) # Simple padding_strategies = ( [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] ) for padding_state in padding_strategies: with self.subTest(f"Padding: {padding_state}"): for truncation_state in [True, "longest_first", "only_first"]: with self.subTest(f"Truncation: {truncation_state}"): output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state) self.assertEqual(len(output["input_ids"]), model_max_length) output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state) self.assertEqual(len(output["input_ids"][0]), model_max_length) # Simple with no truncation # Reset warnings tokenizer.deprecation_warnings = {} with self.assertLogs("transformers", level="WARNING") as cm: output = tokenizer(seq_1, padding=padding_state, truncation=False) self.assertNotEqual(len(output["input_ids"]), model_max_length) self.assertEqual(len(cm.records), 1) self.assertTrue( cm.records[0].message.startswith( "Token indices sequence length is longer than the specified maximum sequence length" " for this model" ) ) tokenizer.deprecation_warnings = {} with self.assertLogs("transformers", level="WARNING") as cm: output = tokenizer([seq_1], padding=padding_state, truncation=False) self.assertNotEqual(len(output["input_ids"][0]), model_max_length) self.assertEqual(len(cm.records), 1) self.assertTrue( cm.records[0].message.startswith( "Token indices sequence length is longer than the specified maximum sequence length" " for this model" ) ) # Overflowing tokens stride = 2 information = tokenizer( seq_0, max_length=total_length - 2, add_special_tokens=False, stride=stride, truncation="longest_first", return_overflowing_tokens=True, # add_prefix_space=False, ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, TokenizersBackend): truncated_sequence = information["input_ids"][0] overflowing_tokens = information["input_ids"][1] self.assertEqual(len(information["input_ids"]), 2) self.assertEqual(len(truncated_sequence), total_length - 2) self.assertEqual(truncated_sequence, sequence[:-2]) self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) else: truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] self.assertEqual(len(truncated_sequence), total_length - 2) self.assertEqual(truncated_sequence, sequence[:-2]) self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) def test_maximum_encoding_length_pair_input(self): tokenizer = self.get_tokenizer(do_lower_case=False, model_max_length=100) # Build a sequence from our model's vocabulary stride = 2 seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) if len(ids) <= 2 + stride: seq_0 = (seq_0 + " ") * (2 + stride) ids = None seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False) self.assertGreater(len(seq0_tokens), 2 + stride) seq_1 = "This is another sentence to be encoded." seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2: seq1_tokens = seq1_tokens + seq1_tokens seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False) seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) self.assertGreater(len(seq1_tokens), 2 + stride) smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens # We are not using the special tokens - a bit too hard to test all the tokenizers with this # TODO try this again later sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) # , add_prefix_space=False) # Test with max model input length model_max_length = tokenizer.model_max_length self.assertEqual(model_max_length, 100) seq_2 = seq_0 * model_max_length self.assertGreater(len(seq_2), model_max_length) sequence1 = tokenizer(seq_1, add_special_tokens=False) total_length1 = len(sequence1["input_ids"]) sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False) total_length2 = len(sequence2["input_ids"]) self.assertLess(total_length1, model_max_length - 10, "Issue with the testing sequence, please update it.") self.assertGreater(total_length2, model_max_length, "Issue with the testing sequence, please update it.") # Simple padding_strategies = ( [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] ) for padding_state in padding_strategies: with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"): for truncation_state in [True, "longest_first", "only_first"]: with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"): output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state) self.assertEqual(len(output["input_ids"]), model_max_length) output = tokenizer([seq_2], [seq_1], padding=padding_state, truncation=truncation_state) self.assertEqual(len(output["input_ids"][0]), model_max_length) # Simple output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second") self.assertEqual(len(output["input_ids"]), model_max_length) output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second") self.assertEqual(len(output["input_ids"][0]), model_max_length) # Simple with no truncation # Reset warnings tokenizer.deprecation_warnings = {} with self.assertLogs("transformers", level="WARNING") as cm: output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False) self.assertNotEqual(len(output["input_ids"]), model_max_length) self.assertEqual(len(cm.records), 1) self.assertTrue( cm.records[0].message.startswith( "Token indices sequence length is longer than the specified maximum sequence length" " for this model" ) ) tokenizer.deprecation_warnings = {} with self.assertLogs("transformers", level="WARNING") as cm: output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False) self.assertNotEqual(len(output["input_ids"][0]), model_max_length) self.assertEqual(len(cm.records), 1) self.assertTrue( cm.records[0].message.startswith( "Token indices sequence length is longer than the specified maximum sequence length" " for this model" ) ) truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode( seq_1, add_special_tokens=False ) truncated_second_sequence = ( tokenizer.encode(seq_0, add_special_tokens=False) + tokenizer.encode(seq_1, add_special_tokens=False)[:-2] ) truncated_longest_sequence = ( truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence ) overflow_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[ -(2 + stride) : ] + tokenizer.encode(seq_1, add_special_tokens=False) overflow_second_sequence = ( tokenizer.encode(seq_0, add_special_tokens=False) + tokenizer.encode(seq_1, add_special_tokens=False)[-(2 + stride) :] ) overflow_longest_sequence = ( overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, TokenizersBackend): information = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation="longest_first", return_overflowing_tokens=True, # add_prefix_space=False, ) truncated_sequence = information["input_ids"][0] overflowing_tokens = information["input_ids"][1] self.assertEqual(len(information["input_ids"]), 2) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_longest_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) self.assertEqual(overflowing_tokens, overflow_longest_sequence) else: # No overflowing tokens when using 'longest' in python tokenizers with self.assertRaises(ValueError) as context: information = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation="longest_first", return_overflowing_tokens=True, # add_prefix_space=False, ) self.assertTrue( context.exception.args[0].startswith( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`." ) ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, TokenizersBackend): information = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation=True, return_overflowing_tokens=True, # add_prefix_space=False, ) truncated_sequence = information["input_ids"][0] overflowing_tokens = information["input_ids"][1] self.assertEqual(len(information["input_ids"]), 2) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_longest_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) self.assertEqual(overflowing_tokens, overflow_longest_sequence) else: # No overflowing tokens when using 'longest' in python tokenizers with self.assertRaises(ValueError) as context: information = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation=True, return_overflowing_tokens=True, # add_prefix_space=False, ) self.assertTrue( context.exception.args[0].startswith( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`." ) ) information_first_truncated = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation="only_first", return_overflowing_tokens=True, # add_prefix_space=False, ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, TokenizersBackend): truncated_sequence = information_first_truncated["input_ids"][0] overflowing_tokens = information_first_truncated["input_ids"][1] self.assertEqual(len(information_first_truncated["input_ids"]), 2) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_first_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens)) self.assertEqual(overflowing_tokens, overflow_first_sequence) else: truncated_sequence = information_first_truncated["input_ids"] overflowing_tokens = information_first_truncated["overflowing_tokens"] self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_first_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :]) information_second_truncated = tokenizer( seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=False, stride=stride, truncation="only_second", return_overflowing_tokens=True, # add_prefix_space=False, ) # Overflowing tokens are handled quite differently in slow and fast tokenizers if isinstance(tokenizer, TokenizersBackend): truncated_sequence = information_second_truncated["input_ids"][0] overflowing_tokens = information_second_truncated["input_ids"][1] self.assertEqual(len(information_second_truncated["input_ids"]), 2) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_second_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens)) self.assertEqual(overflowing_tokens, overflow_second_sequence) else: truncated_sequence = information_second_truncated["input_ids"] overflowing_tokens = information_second_truncated["overflowing_tokens"] self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_second_sequence) self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) def test_special_tokens_mask(self): tokenizer = self.get_tokenizer(do_lower_case=False) sequence_0 = "Encode this." # Testing single inputs encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) encoded_sequence_dict = tokenizer( sequence_0, add_special_tokens=True, return_special_tokens_mask=True, # , add_prefix_space=False ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] self.assertEqual(encoded_sequence, filtered_sequence) def test_special_tokens_mask_input_pairs(self): tokenizer = self.get_tokenizer(do_lower_case=False) sequence_0 = "Encode this." sequence_1 = "This one too please." encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) encoded_sequence_dict = tokenizer( sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True, # add_prefix_space=False, ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) filtered_sequence = [ (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) ] filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) def test_padding_side_in_kwargs(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.get_tokenizer(pretrained_name, padding_side="left", **kwargs) self.assertEqual(tokenizer_r.padding_side, "left") tokenizer_r = self.get_tokenizer(pretrained_name, padding_side="right", **kwargs) self.assertEqual(tokenizer_r.padding_side, "right") self.assertRaises( ValueError, self.tokenizer_class.from_pretrained, pretrained_name, padding_side="unauthorized", **kwargs, ) def test_truncation_side_in_kwargs(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.get_tokenizer(pretrained_name, truncation_side="left", **kwargs) self.assertEqual(tokenizer_r.truncation_side, "left") tokenizer_r = self.get_tokenizer(pretrained_name, truncation_side="right", **kwargs) self.assertEqual(tokenizer_r.truncation_side, "right") self.assertRaises( ValueError, self.tokenizer_class.from_pretrained, pretrained_name, truncation_side="unauthorized", **kwargs, ) def test_encode_basic_padding(self): """Test basic left/right padding behavior using encode() method with max_length strategy.""" tokenizer = self.get_tokenizer(do_lower_case=False) sequence = "Sequence" padding_size = 10 # check correct behaviour if no pad_token_id exists and add it eventually self._check_no_pad_token_padding(tokenizer, sequence) padding_idx = tokenizer.pad_token_id # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True tokenizer.padding_side = "right" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding="max_length") padded_sequence_length = len(padded_sequence) self.assertEqual(sequence_length + padding_size, padded_sequence_length) self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True tokenizer.padding_side = "left" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding="max_length") padded_sequence_length = len(padded_sequence) self.assertEqual(sequence_length + padding_size, padded_sequence_length) self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence) def test_right_and_left_truncation(self): tokenizer = self.get_tokenizer(do_lower_case=False) sequence = "This is a test sequence" # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True truncation_size = 3 tokenizer.truncation_side = "right" encoded_sequence = tokenizer.encode(sequence, add_special_tokens=False) sequence_length = len(encoded_sequence) # Remove EOS/BOS tokens truncated_sequence = tokenizer.encode( sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False ) truncated_sequence_length = len(truncated_sequence) self.assertEqual(sequence_length, truncated_sequence_length + truncation_size) self.assertEqual(encoded_sequence[:-truncation_size], truncated_sequence) # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the truncation flag set to True tokenizer.truncation_side = "left" sequence_length = len(encoded_sequence) truncated_sequence = tokenizer.encode( sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False ) truncated_sequence_length = len(truncated_sequence) self.assertEqual(sequence_length, truncated_sequence_length + truncation_size) self.assertEqual(encoded_sequence[truncation_size:], truncated_sequence) # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_truncation' sequence_length = len(encoded_sequence) tokenizer.truncation_side = "right" truncated_sequence_right = tokenizer.encode(sequence, truncation=True, add_special_tokens=False) truncated_sequence_right_length = len(truncated_sequence_right) self.assertEqual(sequence_length, truncated_sequence_right_length) self.assertEqual(encoded_sequence, truncated_sequence_right) tokenizer.truncation_side = "left" truncated_sequence_left = tokenizer.encode(sequence, truncation="longest_first", add_special_tokens=False) truncated_sequence_left_length = len(truncated_sequence_left) self.assertEqual(sequence_length, truncated_sequence_left_length) self.assertEqual(encoded_sequence, truncated_sequence_left) tokenizer.truncation_side = "right" truncated_sequence_right = tokenizer.encode(sequence, add_special_tokens=False) truncated_sequence_right_length = len(truncated_sequence_right) self.assertEqual(sequence_length, truncated_sequence_right_length) self.assertEqual(encoded_sequence, truncated_sequence_right) tokenizer.truncation_side = "left" truncated_sequence_left = tokenizer.encode(sequence, truncation=False, add_special_tokens=False) truncated_sequence_left_length = len(truncated_sequence_left) self.assertEqual(sequence_length, truncated_sequence_left_length) self.assertEqual(encoded_sequence, truncated_sequence_left) def test_padding_to_multiple_of(self): tokenizer = self.get_tokenizer() if tokenizer.pad_token is None: self.skipTest(reason="No padding token.") else: empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) for key, value in empty_tokens.items(): self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") for key, value in normal_tokens.items(): self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") normal_tokens = tokenizer("This", pad_to_multiple_of=8) for key, value in normal_tokens.items(): self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") # Should also work with truncation normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8) for key, value in normal_tokens.items(): self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") # truncation to something which is not a multiple of pad_to_multiple_of raises an error self.assertRaises( ValueError, tokenizer.__call__, "This", padding=True, truncation=True, max_length=12, pad_to_multiple_of=8, ) def test_padding_with_attention_mask(self): tokenizer = self.get_tokenizer() if tokenizer.pad_token is None: self.skipTest(reason="No padding token.") if "attention_mask" not in tokenizer.model_input_names: self.skipTest(reason="This model does not use attention mask.") features = [ {"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]}, {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]}, ] padded_features = tokenizer.pad(features) if tokenizer.padding_side == "right": self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]]) else: self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]]) @parameterized.expand([(True,), (False,)]) def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): """ This test checks that padding works as expected when tokenizing a sequence. Padding is expected to have no effect when the input is a single sequence and the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length using tokenizer classes `padding_side` attribute. Also, we check that passing `padding_side` as call time kwarg works same way as when one sets `tokenizer.padding_side` attribute. """ tokenizer = self.get_tokenizer(do_lower_case=False) sequence = "Sequence" # check correct behaviour if no pad_token_id exists and add it eventually self._check_no_pad_token_padding(tokenizer, sequence) padding_size = 10 padding_idx = tokenizer.pad_token_id token_type_padding_idx = tokenizer.pad_token_type_id encoded_sequence = tokenizer(sequence, return_special_tokens_mask=True) input_ids = encoded_sequence["input_ids"] special_tokens_mask = encoded_sequence["special_tokens_mask"] sequence_length = len(input_ids) # Test 'longest' and 'no_padding' don't do anything not_padded_sequence = tokenizer( sequence, padding=True, return_special_tokens_mask=True, ) not_padded_input_ids = not_padded_sequence["input_ids"] not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] not_padded_sequence_length = len(not_padded_input_ids) self.assertEqual(sequence_length, not_padded_sequence_length) self.assertEqual(input_ids, not_padded_input_ids) self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) not_padded_sequence = tokenizer( sequence, padding=False, return_special_tokens_mask=True, ) not_padded_input_ids = not_padded_sequence["input_ids"] not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] not_padded_sequence_length = len(not_padded_input_ids) self.assertEqual(sequence_length, not_padded_sequence_length) self.assertEqual(input_ids, not_padded_input_ids) self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) # Test right padding tokenizer_kwargs_right = { "max_length": sequence_length + padding_size, "padding": "max_length", "return_special_tokens_mask": True, } if not use_padding_as_call_kwarg: tokenizer.padding_side = "right" else: tokenizer_kwargs_right["padding_side"] = "right" right_padded_sequence = tokenizer(sequence, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] right_padded_sequence_length = len(right_padded_input_ids) self.assertEqual(sequence_length + padding_size, right_padded_sequence_length) self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids) self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) # Test left padding tokenizer_kwargs_left = { "max_length": sequence_length + padding_size, "padding": "max_length", "return_special_tokens_mask": True, } if not use_padding_as_call_kwarg: tokenizer.padding_side = "left" else: tokenizer_kwargs_left["padding_side"] = "left" left_padded_sequence = tokenizer(sequence, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) self.assertEqual(sequence_length + padding_size, left_padded_sequence_length) self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids) self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask) if "token_type_ids" in tokenizer.model_input_names: token_type_ids = encoded_sequence["token_type_ids"] left_padded_token_type_ids = left_padded_sequence["token_type_ids"] right_padded_token_type_ids = right_padded_sequence["token_type_ids"] self.assertEqual(token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids) self.assertEqual([token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids) if "attention_mask" in tokenizer.model_input_names: attention_mask = encoded_sequence["attention_mask"] right_padded_attention_mask = right_padded_sequence["attention_mask"] left_padded_attention_mask = left_padded_sequence["attention_mask"] self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask) self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask) def test_get_vocab(self): tokenizer = self.get_tokenizer(do_lower_case=False) vocab_dict = tokenizer.get_vocab() self.assertIsInstance(vocab_dict, dict) self.assertGreaterEqual(len(tokenizer), len(vocab_dict)) vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] self.assertEqual(len(vocab), len(tokenizer)) tokenizer.add_tokens(["asdfasdfasdfasdf"]) vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] self.assertEqual(len(vocab), len(tokenizer)) @slow def test_conversion_reversible(self): tokenizer = self.get_tokenizer(do_lower_case=False) vocab = tokenizer.get_vocab() for word, ind in vocab.items(): if word == tokenizer.unk_token: continue self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind) self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word) def test_call(self): # Tests that all call wrap to encode_plus tokenizer = self.get_tokenizer(do_lower_case=False) sequences = [ "Testing batch encode plus", "Testing batch encode plus with different sequence lengths", "Testing batch encode plus with different sequence lengths correctly pads", ] # Test not batched encoded_sequences_1 = tokenizer(sequences[0]) encoded_sequences_2 = tokenizer(sequences[0]) self.assertEqual(encoded_sequences_1, encoded_sequences_2) # Test not batched pairs encoded_sequences_1 = tokenizer(sequences[0], sequences[1]) encoded_sequences_2 = tokenizer(sequences[0], sequences[1]) self.assertEqual(encoded_sequences_1, encoded_sequences_2) # Test batched encoded_sequences_1 = tokenizer(sequences) encoded_sequences_2 = tokenizer(sequences) self.assertEqual(encoded_sequences_1, encoded_sequences_2) # Test batched pairs encoded_sequences_1 = tokenizer(list(zip(sequences, sequences))) encoded_sequences_2 = tokenizer(sequences, sequences) self.assertEqual(encoded_sequences_1, encoded_sequences_2) def test_batch_encode_plus_batch_sequence_length(self): # Tests that all encoded values have the correct size tokenizer = self.get_tokenizer(do_lower_case=False) sequences = [ "Testing batch encode plus", "Testing batch encode plus with different sequence lengths", "Testing batch encode plus with different sequence lengths correctly pads", ] encoded_sequences = [tokenizer(sequence) for sequence in sequences] encoded_sequences_batch = tokenizer(sequences, padding=False) self.assertListEqual( encoded_sequences, TokenizerTesterMixin.convert_batch_to_list_format(encoded_sequences_batch) ) maximum_length = len(max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)) # check correct behaviour if no pad_token_id exists and add it eventually self._check_no_pad_token_padding(tokenizer, sequences) encoded_sequences_padded = [ tokenizer(sequence, max_length=maximum_length, padding="max_length") for sequence in sequences ] encoded_sequences_batch_padded = tokenizer(sequences, padding=True) self.assertListEqual( encoded_sequences_padded, TokenizerTesterMixin.convert_batch_to_list_format(encoded_sequences_batch_padded), ) # check 'longest' is unsensitive to a max length encoded_sequences_batch_padded_1 = tokenizer(sequences, padding=True) encoded_sequences_batch_padded_2 = tokenizer(sequences, max_length=maximum_length + 10, padding="longest") for key in encoded_sequences_batch_padded_1: self.assertListEqual( encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key], ) # check 'no_padding' is unsensitive to a max length encoded_sequences_batch_padded_1 = tokenizer(sequences, padding=False) encoded_sequences_batch_padded_2 = tokenizer(sequences, max_length=maximum_length + 10, padding=False) for key in encoded_sequences_batch_padded_1: self.assertListEqual( encoded_sequences_batch_padded_1[key], encoded_sequences_batch_padded_2[key], ) def test_batch_encode_plus_padding(self): # Test that padded sequences are equivalent between batch and individual encoding # Right padding tests tokenizer = self.get_tokenizer(do_lower_case=False) sequences = [ "Testing batch encode plus", "Testing batch encode plus with different sequence lengths", "Testing batch encode plus with different sequence lengths correctly pads", ] max_length = 100 # check correct behaviour if no pad_token_id exists and add it eventually self._check_no_pad_token_padding(tokenizer, sequences) encoded_sequences = [ tokenizer(sequence, max_length=max_length, padding="max_length") for sequence in sequences ] encoded_sequences_batch = tokenizer(sequences, max_length=max_length, padding="max_length") self.assertListEqual( encoded_sequences, TokenizerTesterMixin.convert_batch_to_list_format(encoded_sequences_batch) ) # Left padding tests tokenizer = self.get_tokenizer(do_lower_case=False) tokenizer.padding_side = "left" sequences = [ "Testing batch encode plus", "Testing batch encode plus with different sequence lengths", "Testing batch encode plus with different sequence lengths correctly pads", ] max_length = 100 # check correct behaviour if no pad_token_id exists and add it eventually self._check_no_pad_token_padding(tokenizer, sequences) encoded_sequences = [ tokenizer(sequence, max_length=max_length, padding="max_length") for sequence in sequences ] encoded_sequences_batch = tokenizer(sequences, max_length=max_length, padding="max_length") self.assertListEqual( encoded_sequences, TokenizerTesterMixin.convert_batch_to_list_format(encoded_sequences_batch) ) def test_pretokenized_inputs(self): # Test when inputs are pretokenized # All methods (encode, encode_plus, __call__) go through the same code path, # so we only test __call__ tokenizer = self.get_tokenizer(do_lower_case=False) if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space: return # Prepare a sequence from our tokenizer vocabulary sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20) token_sequence = sequence.split() # Test single sequence output = tokenizer(token_sequence, is_split_into_words=True, add_special_tokens=False) output_sequence = tokenizer(sequence, add_special_tokens=False) for key in output: self.assertEqual(output[key], output_sequence[key]) output = tokenizer(token_sequence, is_split_into_words=True, add_special_tokens=True) output_sequence = tokenizer(sequence, add_special_tokens=True) for key in output: self.assertEqual(output[key], output_sequence[key]) # Test sequence pairs output = tokenizer(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False) output_sequence = tokenizer(sequence, sequence, add_special_tokens=False) for key in output: self.assertEqual(output[key], output_sequence[key]) output = tokenizer(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True) output_sequence = tokenizer(sequence, sequence, add_special_tokens=True) for key in output: self.assertEqual(output[key], output_sequence[key]) # Test batched inputs sequence_batch = [sequence.strip()] * 2 + [sequence.strip() + " " + sequence.strip()] token_sequence_batch = [s.split() for s in sequence_batch] sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch] output = tokenizer(token_sequence_batch, is_split_into_words=True, add_special_tokens=False) output_sequence = tokenizer(sequence_batch_cleaned_up_spaces, add_special_tokens=False) for key in output: self.assertEqual(output[key], output_sequence[key]) output = tokenizer(token_sequence_batch, is_split_into_words=True, add_special_tokens=True) output_sequence = tokenizer(sequence_batch_cleaned_up_spaces, add_special_tokens=True) for key in output: self.assertEqual(output[key], output_sequence[key]) # Test batch_encode_plus for pretokenized inputs pairs sequence_pair_batch = [(sequence.strip(), sequence.strip())] * 2 + [ (sequence.strip() + " " + sequence.strip(), sequence.strip()) ] token_sequence_pair_batch = [tuple(s.split() for s in pair) for pair in sequence_pair_batch] sequence_pair_batch_cleaned_up_spaces = [ tuple(" " + " ".join(s) for s in pair) for pair in token_sequence_pair_batch ] output = tokenizer(token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False) output_sequence = tokenizer(sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False) for key in output: self.assertEqual(output[key], output_sequence[key]) output = tokenizer(token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True) output_sequence = tokenizer(sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True) for key in output: self.assertEqual(output[key], output_sequence[key]) def _check_no_pad_token_padding(self, tokenizer, sequences): # if tokenizer does v have pad_token_id, an error should be thrown if tokenizer.pad_token_id is None: with self.assertRaises(ValueError): if isinstance(sequences, list): tokenizer(sequences, padding="longest") else: tokenizer(sequences, padding=True) # add pad_token_id to pass subsequent tests tokenizer.add_special_tokens({"pad_token": ""}) @require_torch def test_prepare_seq2seq_batch(self): if not self.test_seq2seq: self.skipTest(reason="test_seq2seq is set to False") tokenizer = self.get_tokenizer() # Longer text that will definitely require truncation. src_text = [ " UN Chief Says There Is No Military Solution in Syria", " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" " will only worsen the violence and misery for millions of people.", ] tgt_text = [ "Şeful ONU declară că nu există o soluţie militară în Siria", "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al" ' Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi' " că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", ] try: batch = tokenizer( src_text, text_target=tgt_text, max_length=3, max_target_length=10, return_tensors="pt", src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error ) except NotImplementedError: self.skipTest(reason="Encountered NotImplementedError calling prepare_seq2seq_batch") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 10) # max_target_length will default to max_length if not specified batch = tokenizer(src_text, text_target=tgt_text, max_length=3, return_tensors="pt") self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.labels.shape[1], 3) batch_encoder_only = tokenizer(src_text, max_length=3, max_target_length=10, return_tensors="pt") self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) def test_batch_encode_dynamic_overflowing(self): """ When calling batch_encode with multiple sequence it can returns different number of overflowing encoding for each sequence: [ Sequence 1: [Encoding 1, Encoding 2], Sequence 2: [Encoding 1], Sequence 3: [Encoding 1, Encoding 2, ... Encoding N] ] This needs to be padded so that it can represented as a tensor """ for tokenizer, pretrained_name, kwargs in self.tokenizers_list: tokenizer = self.get_tokenizer(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): if is_torch_available(): returned_tensor = "pt" else: self.skipTest(reason="No expected framework (PT) found") if not tokenizer.pad_token or tokenizer.pad_token_id < 0: self.skipTest(reason="This tokenizer has no padding token set, or pad_token_id < 0") tokens = tokenizer( "HuggingFace is solving NLP one commit at a time", max_length=6, padding=True, truncation=True, return_tensors=returned_tensor, return_overflowing_tokens=True, ) for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): self.assertEqual(len(tokens[key].shape), 2) # Mono sample tokens = tokenizer( ["HuggingFace is solving NLP one commit at a time"], max_length=6, padding=True, truncation="only_first", return_tensors=returned_tensor, return_overflowing_tokens=True, ) for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): self.assertEqual(len(tokens[key].shape), 2) self.assertEqual(tokens[key].shape[-1], 6) # Multi sample tokens = tokenizer( ["HuggingFace is solving NLP one commit at a time", "Very tiny input"], max_length=6, padding=True, truncation="only_first", return_tensors=returned_tensor, return_overflowing_tokens=True, ) for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): self.assertEqual(len(tokens[key].shape), 2) self.assertEqual(tokens[key].shape[-1], 6) def test_added_tokens_serialization(self): new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): # Test loading a tokenizer from the hub with a new eos token tokenizer_r = self.get_tokenizer(pretrained_name, eos_token=new_eos) self.assertEqual(tokenizer_r._special_tokens_map["eos_token"], new_eos) # Check that the token content is present (may not preserve all AddedToken attributes) self.assertIn(str(new_eos), [str(t) for t in tokenizer_r.added_tokens_decoder.values()]) EXPECTED_ADDED_TOKENS_DECODER = tokenizer_r.added_tokens_decoder # Test saving and reloading the tokenizer with tempfile.TemporaryDirectory() as tmp_dir: tokenizer_r.save_pretrained(tmp_dir) with self.subTest("Saving tokenizer locally and reloading"): tokenizer = self.tokenizer_class.from_pretrained(tmp_dir) self.assertTrue(str(new_eos) not in tokenizer.extra_special_tokens) # Check that the token content is present (may not preserve all AddedToken attributes) self.assertIn(str(new_eos), [str(t) for t in tokenizer.added_tokens_decoder.values()]) self.assertEqual(str(tokenizer.added_tokens_decoder[tokenizer.eos_token_id]), str(new_eos)) # Check that all original tokens are still present (by string representation) expected_tokens = {str(t) for t in EXPECTED_ADDED_TOKENS_DECODER.values()} actual_tokens = {str(t) for t in tokenizer.added_tokens_decoder.values()} self.assertTrue(expected_tokens.issubset(actual_tokens)) def test_tokenizer_initialization_with_conflicting_key(self): with self.assertRaises(AttributeError, msg="conflicts with the method"): self.get_tokenizer(add_special_tokens=True) with self.assertRaises(AttributeError, msg="conflicts with the method"): self.get_tokenizer(get_vocab=True) def test_empty_input_string(self): empty_input_string = "" tokenizer_return_type = [] output_tensor_type = [] if is_torch_available(): import numpy as np import torch tokenizer_return_type.append("pt") output_tensor_type.append(torch.int64) tokenizer_return_type.append("np") output_tensor_type.append(np.int64) if is_mlx_available(): import mlx.core as mx tokenizer_return_type.append("mlx") output_tensor_type.append(mx.int32) if len(tokenizer_return_type) == 0: self.skipTest(reason="No expected framework from PT, or MLX found") tokenizer = self.get_tokenizer() for return_type, target_type in zip(tokenizer_return_type, output_tensor_type): output = tokenizer(empty_input_string, return_tensors=return_type) self.assertEqual(output.input_ids.dtype, target_type) def test_pad_token_initialization(self): """Test that passing pad_token when creating a tokenizer works correctly.""" tokenizer = self.get_tokenizer(pad_token="[PAD]") # Verify the pad_token was set correctly self.assertEqual(tokenizer.pad_token, "[PAD]") self.assertIsNotNone(tokenizer.pad_token_id) # Test with two sequences of different lengths to trigger padding seq_0 = "Test this method." seq_1 = "With these inputs and some extra tokens here." # Test padding works with the custom pad_token output_with_padding = tokenizer( [seq_0, seq_1], padding=True, return_attention_mask=True, ) # Check that sequences were padded to the same length self.assertEqual( len(output_with_padding["input_ids"][0]), len(output_with_padding["input_ids"][1]), ) # Check that attention mask has 0s where padding was added (on the shorter sequence) # Find the shorter sequence unpadded_lengths = [ len(tokenizer(seq_0, add_special_tokens=True)["input_ids"]), len(tokenizer(seq_1, add_special_tokens=True)["input_ids"]), ] shorter_idx = 0 if unpadded_lengths[0] < unpadded_lengths[1] else 1 self.assertIn(0, output_with_padding["attention_mask"][shorter_idx]) def test_bos_token_with_add_bos_token_true(self): """Test that passing bos_token with add_bos_token=True during initialization adds the BOS token.""" try: tokenizer = self.get_tokenizer(bos_token="", add_bos_token=True) except TypeError: # Some tokenizers might not support add_bos_token parameter self.skipTest("Tokenizer does not support add_bos_token parameter") test_string = "Hello world" # Verify bos_token was set self.assertEqual(tokenizer.bos_token, "") # Verify the tokenizer was created successfully with these parameters output = tokenizer(test_string, add_special_tokens=False) self.assertIsNotNone(output["input_ids"]) def test_bos_token_with_add_bos_token_false(self): """Test that passing bos_token with add_bos_token=False during initialization does not add the BOS token.""" try: tokenizer = self.get_tokenizer(bos_token="", add_bos_token=False) except TypeError: # Some tokenizers might not support add_bos_token parameter self.skipTest("Tokenizer does not support add_bos_token parameter") test_string = "Hello world" # Verify bos_token was set self.assertEqual(tokenizer.bos_token, "") # Verify the tokenizer was created successfully with these parameters output = tokenizer(test_string, add_special_tokens=False) self.assertIsNotNone(output["input_ids"]) def test_local_files_only(self): from transformers import AutoTokenizer pretrained_list = getattr(self, "from_pretrained_id", []) or [] for pretrained_name in pretrained_list: with self.subTest(f"AutoTokenizer ({pretrained_name})"): # First cache the tokenizer files try: tokenizer_cached = AutoTokenizer.from_pretrained(pretrained_name) # Now load with local_files_only=True tokenizer_local = AutoTokenizer.from_pretrained(pretrained_name, local_files_only=True) # Check that the two tokenizers are identical self.assertEqual(tokenizer_cached.get_vocab(), tokenizer_local.get_vocab()) self.assertEqual( tokenizer_cached.all_special_tokens_extended, tokenizer_local.all_special_tokens_extended, ) except Exception as _: pass # if the pretrained model is not loadable how could it pass locally :) @require_tokenizers class TokenizersBackendCommonTest(TokenizersBackendTesterMixin, unittest.TestCase): """ A single test class that runs all tokenizers-backend tests once. Uses BertTokenizer as a representative tokenizer. """ tokenizer_class = BertTokenizer rust_tokenizer_class = BertTokenizerFast from_pretrained_id = "google-bert/bert-base-uncased" from_pretrained_kwargs = {} class SentencePieceBackendCommonTest(unittest.TestCase, SentencePieceBackendTesterMixin): """ A single test class that runs all SentencePiece-backend tests once. Uses T5Tokenizer as a representative SentencePiece tokenizer. """ tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast test_slow_tokenizer = True test_rust_tokenizer = True from_pretrained_id = "google-t5/t5-base" from_pretrained_kwargs = {"use_fast": False} def test_add_tokens(self): tokenizer_r = self.get_rust_tokenizer() vocab_size = len(tokenizer_r) self.assertEqual(tokenizer_r.add_tokens(""), 0) self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2) self.assertEqual(len(tokenizer_r), vocab_size + 3) self.assertEqual(tokenizer_r.add_special_tokens({}), 0) self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) self.assertRaises(ValueError, tokenizer_r.add_special_tokens, {"additional_special_tokens": ""}) self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": [""]}), 1) self.assertEqual( tokenizer_r.add_special_tokens({"additional_special_tokens": ["", ""]}), 2 ) added_vocab = tokenizer_r.get_added_vocab() self.assertIn("", added_vocab) def test_add_tokens_tokenizer(self): tokenizer = self.get_tokenizer(do_lower_case=False) vocab_size = tokenizer.vocab_size all_size = len(tokenizer) new_toks = [ AddedToken("newtokenone", rstrip=False, lstrip=False), AddedToken("newtokentwo", rstrip=False, lstrip=False), ] added_toks = tokenizer.add_tokens(new_toks) vocab_size_2 = tokenizer.vocab_size all_size_2 = len(tokenizer) self.assertEqual(vocab_size, vocab_size_2) self.assertEqual(added_toks, len(new_toks)) self.assertEqual(all_size_2, all_size + len(new_toks)) tokens = tokenizer.encode("newtokenone words newtokentwo", add_special_tokens=False) self.assertGreaterEqual(len(tokens), 3) self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-1], tokenizer.vocab_size - 1) new_specials = { "eos_token": AddedToken("<|eos_new|>", rstrip=False, lstrip=False), "pad_token": AddedToken("<|pad_new|>", rstrip=False, lstrip=False), } added_specials = tokenizer.add_special_tokens(new_specials) all_size_3 = len(tokenizer) self.assertEqual(added_specials, len(new_specials)) self.assertEqual(all_size_3, all_size_2 + len(new_specials)) tokens = tokenizer.encode("<|eos_new|> newtokenone <|pad_new|>", add_special_tokens=False) self.assertEqual(tokens[0], tokenizer.eos_token_id) self.assertEqual(tokens[-1], tokenizer.pad_token_id) def test_alignment_methods(self): self.skipTest("SentencePiece fast tokenizers do not expose token alignment metadata.") def test_local_files_only(self): from transformers import AutoTokenizer pretrained_list = getattr(self, "from_pretrained_id", []) or [] for pretrained_name in pretrained_list: with self.subTest(f"AutoTokenizer ({pretrained_name})"): # First cache the tokenizer files try: tokenizer_cached = AutoTokenizer.from_pretrained(pretrained_name) # Now load with local_files_only=True tokenizer_local = AutoTokenizer.from_pretrained(pretrained_name, local_files_only=True) # Check that the two tokenizers are identical self.assertEqual(tokenizer_cached.get_vocab(), tokenizer_local.get_vocab()) self.assertEqual( tokenizer_cached.all_special_tokens_extended, tokenizer_local.all_special_tokens_extended, ) except Exception as _: pass # if the pretrained model is not loadable how could it pass locally :)