# Byte-lingua-code/superbpe/tokenizers_superbpe/bindings/python/tests/bindings/test_tokenizer.py
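# Tests for the Python bindings: AddedToken and Tokenizer behaviour (adding tokens,
# encoding/decoding, truncation, padding, post-processing, serialization, repr).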

import pickle

import numpy as np
import pytest
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, Unigram
from tokenizers.pre_tokenizers import ByteLevel, Metaspace
from tokenizers.processors import RobertaProcessing, TemplateProcessing
from tokenizers.normalizers import Strip, Lowercase, Sequence

from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files


class TestAddedToken:
    def test_instantiate_with_content_only(self):
        added_token = AddedToken("<mask>")
        added_token.content = "<MASK>"
        assert added_token.content == "<MASK>"
        assert type(added_token) == AddedToken
        added_token.content = added_token.content.lower()

        assert added_token.special == False
        added_token.special = True
        assert added_token.special == True
        added_token.special = False

        assert str(added_token) == "<mask>"
        assert (
            repr(added_token)
            == 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False)'
        )
        assert added_token.rstrip == False
        assert added_token.lstrip == False
        assert added_token.single_word == False
        assert added_token.normalized == True
        assert isinstance(pickle.loads(pickle.dumps(added_token)), AddedToken)

    def test_can_set_rstrip(self):
        added_token = AddedToken("<mask>", rstrip=True)
        assert added_token.rstrip == True
        assert added_token.lstrip == False
        assert added_token.single_word == False
        assert added_token.normalized == True

    def test_can_set_lstrip(self):
        added_token = AddedToken("<mask>", lstrip=True)
        assert added_token.rstrip == False
        assert added_token.lstrip == True
        assert added_token.single_word == False
        assert added_token.normalized == True

    def test_can_set_single_word(self):
        added_token = AddedToken("<mask>", single_word=True)
        assert added_token.rstrip == False
        assert added_token.lstrip == False
        assert added_token.single_word == True
        assert added_token.normalized == True

    def test_can_set_normalized(self):
        added_token = AddedToken("<mask>", normalized=False)
        assert added_token.rstrip == False
        assert added_token.lstrip == False
        assert added_token.single_word == False
        assert added_token.normalized == False


class TestTokenizer:
    def test_has_expected_type_and_methods(self):
        tokenizer = Tokenizer(BPE())
        assert type(tokenizer) == Tokenizer
        assert callable(tokenizer.num_special_tokens_to_add)
        assert callable(tokenizer.get_vocab)
        assert callable(tokenizer.get_vocab_size)
        assert callable(tokenizer.enable_truncation)
        assert callable(tokenizer.no_truncation)
        assert callable(tokenizer.enable_padding)
        assert callable(tokenizer.no_padding)
        assert callable(tokenizer.encode)
        assert callable(tokenizer.encode_batch)
        assert callable(tokenizer.decode)
        assert callable(tokenizer.decode_batch)
        assert callable(tokenizer.token_to_id)
        assert callable(tokenizer.id_to_token)
        assert callable(tokenizer.add_tokens)
        assert callable(tokenizer.add_special_tokens)
        assert callable(tokenizer.train)
        assert callable(tokenizer.post_process)
        assert isinstance(tokenizer.model, Model)
        assert tokenizer.normalizer is None
        assert tokenizer.pre_tokenizer is None
        assert tokenizer.post_processor is None
        assert tokenizer.decoder is None
        assert isinstance(pickle.loads(pickle.dumps(Tokenizer(BPE()))), Tokenizer)

    def test_add_tokens(self):
        tokenizer = Tokenizer(BPE())
        added = tokenizer.add_tokens(["my", "name", "is", "john"])
        assert added == 4

        tokens = [AddedToken("the"), AddedToken("quick", normalized=False), AddedToken()]
        assert tokens[0].normalized == True
        added = tokenizer.add_tokens(tokens)
        assert added == 2
        assert tokens[0].normalized == True
        assert tokens[1].normalized == False

    def test_add_special_tokens(self):
        tokenizer = Tokenizer(BPE())

        # Can add special tokens as `str`
        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
        assert added == 4

        # Can add special tokens as `AddedToken`
        tokens = [AddedToken("the"), AddedToken("quick", normalized=True), AddedToken()]
        assert tokens[0].normalized == True
        added = tokenizer.add_special_tokens(tokens)
        assert added == 2
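        # Adding a token as *special* appears to flip its default `normalized` flag to False
        # unless it was set explicitly (as for "quick" above), hence the asserts below.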
        assert tokens[0].normalized == False
        assert tokens[1].normalized == True

    def test_encode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
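        # `Encoding.words` is deprecated in favour of `Encoding.word_ids`, hence the expected warning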
        with pytest.warns(DeprecationWarning):
            assert type(output.words) == list
        assert type(output.word_ids) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

        # Can encode a single pre-tokenized sequence
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["my", "name", "is", "john"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2

    def test_encode_formats(self, bert_files):
        print("Broken by the change from std::usize::MAX to usize::MAX")
        return

        with pytest.deprecated_call():
            tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
        # Encode
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

        # Encode batch
        result_single = [
            ["[CLS]", "my", "name", "is", "john", "[SEP]"],
            ["[CLS]", "my", "name", "is", "georges", "[SEP]"],
        ]
        result_pair = [
            ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"],
            ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
        ]

        def format(encodings):
            return [e.tokens for e in encodings]

        def test_single(input, is_pretokenized=False):
            output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
            assert format(output) == result_single

        def test_pair(input, is_pretokenized=False):
            output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
            assert format(output) == result_pair
        # Classic inputs

        # Lists
        test_single(["My name is John", "My name is Georges"])
        test_pair([("my name is john", "pair"), ("my name is georges", "pair")])
        test_pair([["my name is john", "pair"], ["my name is georges", "pair"]])

        # Tuples
        test_single(("My name is John", "My name is Georges"))
        test_pair((("My name is John", "pair"), ("My name is Georges", "pair")))

        # Numpy
        test_single(np.array(["My name is John", "My name is Georges"]))
        test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
        test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))

        # PreTokenized inputs

        # Lists
        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
        test_pair(
            [
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ],
            True,
        )
        test_pair(
            [
                [["My", "name", "is", "John"], ["pair"]],
                [["My", "name", "is", "Georges"], ["pair"]],
            ],
            True,
        )

        # Tuples
        test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
        test_pair(
            (
                (("My", "name", "is", "John"), ("pair",)),
                (("My", "name", "is", "Georges"), ("pair",)),
            ),
            True,
        )
        test_pair(
            (
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ),
            True,
        )

        # Numpy
        test_single(
            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
            True,
        )
        test_single(
            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
            True,
        )
        test_pair(
            np.array(
                [
                    [["My", "name", "is", "John"], ["pair"]],
                    [["My", "name", "is", "Georges"], ["pair"]],
                ],
                dtype=object,
            ),
            True,
        )
        test_pair(
            np.array(
                (
                    (("My", "name", "is", "John"), ("pair",)),
                    (("My", "name", "is", "Georges"), ("pair",)),
                ),
                dtype=object,
            ),
            True,
        )
        # Malformed
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode([["my", "name"]])
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode("My name is john", [["pair"]])
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode("my name is john", ["pair"])
        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", is_pretokenized=True)
        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)

    def test_encode_add_special_tokens(self, roberta_files):
        with pytest.deprecated_call():
            tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = RobertaProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
        assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]

    def test_truncation(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
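        # (with the default longest-first strategy, both members of the pair are trimmed to fit)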
| output = tokenizer.encode("my name is john", "pair") | |
| assert output.tokens == ["my", "pair"] | |
| # Can get the params and give them to enable_truncation | |
| trunc = tokenizer.truncation | |
| tokenizer.enable_truncation(**trunc) | |
| # Left truncation direction | |
| tokenizer.enable_truncation(2, direction="left") | |
| output = tokenizer.encode("my name is john") | |
| assert output.tokens == ["is", "john"] | |
| output = tokenizer.encode("my name is john", "pair") | |
| assert output.tokens == ["john", "pair"] | |

    def test_padding(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified length otherwise
        tokenizer.enable_padding(length=4)
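        # enable_padding defaults to pad_token="[PAD]" with pad_id=0, which is what shows up below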
| output = tokenizer.encode("my name") | |
| assert output.tokens == ["my", "name", "[PAD]", "[PAD]"] | |
| output = tokenizer.encode("my name", "pair") | |
| assert output.tokens == ["my", "name", "pair", "[PAD]"] | |
| # Can get the params and give them to enable_padding | |
| padding = tokenizer.padding | |
| tokenizer.enable_padding(**padding) | |

    def test_decode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3])
        assert output == "my name is john"

        # Can decode batch
        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
        assert output == ["my name is john", "pair"]

    def test_get_vocab(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab with added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=True)
        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

        # Can retrieve vocab without added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=False)
        assert vocab == {}
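        # (empty because BPE() has no model vocab; every token above lives in the added-tokens table)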
        # Can retrieve added token decoder
        vocab = tokenizer.get_added_tokens_decoder()
        assert vocab == {
            0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
            1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
            2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
            3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
            4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
        }

    def test_get_vocab_size(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab's size with added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=True)
        assert size == 5

        # Can retrieve vocab's size without added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=False)
        assert size == 0

    def test_post_process(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]

    def test_multiprocessing_with_parallelism(self):
        tokenizer = Tokenizer(BPE())
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)

    def test_from_pretrained(self):
        tokenizer = Tokenizer.from_pretrained("bert-base-cased")
        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
        assert output.tokens == ["Hey", "there", "dear", "friend", "!"]

    def test_from_pretrained_revision(self):
        tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test")
        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
        assert output.tokens == ["hey", "there", "dear", "friend", "!"]
| tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2") | |
| output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False) | |
| assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"] | |

    def test_unigram_byte_fallback(self):
        vocab = [
            ("<unk>", 0.0),
            ("A", -0.01),
            ("sen", -0.02),
            ("te", -0.03),
            ("n", -0.04),
            ("ce", -0.05),
            ("<0xF0>", -0.06),
            ("<0x9F>", -0.06),
            ("<0xA4>", -0.06),
            ("<0x97>", -0.06),
            (" ", -0.4),
        ]
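        # <0xF0> <0x9F> <0xA4> <0x97> are the UTF-8 bytes of "🤗", so byte fallback can
        # represent it even though the emoji itself is not in the vocab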
        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))
        output = tokenizer.encode("A sentence 🤗")
        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 0]
        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "🤗"]

        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))
        output = tokenizer.encode("A sentence 🤗")
        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]

    def test_encode_special_tokens(self):
        tokenizer = Tokenizer.from_pretrained("t5-base")
        tokenizer.add_tokens(["<eot>"])
        tokenizer.add_special_tokens(["<end_of_text>"])
        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
        assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]

        tokenizer.encode_special_tokens = True
        assert tokenizer.encode_special_tokens == True
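        # With encode_special_tokens=True, added *special* tokens are no longer matched as a whole
        # and get split by the model like regular text, while plain added tokens (<eot>) still are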
| output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False) | |
| assert output.tokens == [ | |
| "▁Hey", | |
| "▁there", | |
| "<", | |
| "end", | |
| "_", | |
| "of", | |
| "_", | |
| "text", | |
| ">", | |
| "▁dear", | |
| "<eot>", | |
| "▁friend", | |
| "!", | |
| ] | |
| tokenizer.add_tokens(["of_text>"]) | |
| output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False) | |
| assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"] | |

    def test_splitting(self):
        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-new-metaspace")
        tokenizer.pre_tokenizer.split = False
        tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)])
        assert tokenizer.encode("<REPR_END>inform<s>. Hey.       .", add_special_tokens=False).tokens == [
            "<REPR_END>",
            "in",
            "form",
            "<s>",
            ".",
            "▁Hey",
            ".",
            "▁▁▁▁▁▁",
            "▁.",
        ]

        assert tokenizer.encode("<REPR_END>inform<s>. Hey.       .", add_special_tokens=False).ids == [
            32000,
            262,
            689,
            1,
            29889,
            18637,
            29889,
            539,
            869,
        ]

        assert tokenizer.encode("inform<s>. Hey.       .").tokens == [
            "<s>",
            "▁inform",
            "<s>",
            ".",
            "▁Hey",
            ".",
            "▁▁▁▁▁▁",
            "▁.",
        ]
        assert tokenizer.encode("inform<s>. Hey.       .", add_special_tokens=False).tokens == [
            "▁inform",
            "<s>",
            ".",
            "▁Hey",
            ".",
            "▁▁▁▁▁▁",
            "▁.",
        ]

    def test_decode_special(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
        assert output == "my name is john"
        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
        assert output == "name is john"

        assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)

    def test_setting_to_none(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Strip()
        tokenizer.normalizer = None
        assert tokenizer.normalizer == None

        tokenizer.pre_tokenizer = Metaspace()
        tokenizer.pre_tokenizer = None
        assert tokenizer.pre_tokenizer == None


class TestTokenizerRepr:
    def test_repr(self):
        tokenizer = Tokenizer(BPE())
        out = repr(tokenizer)
        assert (
            out
            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
        )

    def test_repr_complete(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = TemplateProcessing(
            single=["[CLS]", "$0", "[SEP]"],
            pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
        )
        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
        out = repr(tokenizer)
        assert (
            out
            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[Lowercase(), Strip(strip_left=True, strip_right=True)]), pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[1], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[0], tokens=["[SEP]"])}), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
        )