Toy WordLevel tokenizer created for testing.

Code used for its creation:
```
from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Digits, Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer

SMALL_TRAINING_CORPUS = [
    ["This is the first sentence.", "This is the second one."],
    ["This sentence (contains #) over symbols and numbers 12 3.", "But not this one."],
]

# Word-level model: each whole word is one token; unseen words map to [UNK].
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

# Unicode-decompose, lowercase, and strip accents before pre-tokenization.
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# Split on whitespace and punctuation, then split numbers into single digits.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])

# Wrap sequences in [CLS]/[SEP]; the ids (1 and 2) match the positions of
# [CLS] and [SEP] in the trainer's special_tokens list below.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

trainer = WordLevelTrainer(vocab_size=100, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train_from_iterator(SMALL_TRAINING_CORPUS, trainer=trainer)
tokenizer.save("tokenizer.json")
```
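
As a quick sanity check (not part of the original creation script), the saved file can be loaded back with the `tokenizers` API. A minimal sketch; the exact ids depend on the trained vocabulary ordering, so only the expected token strings are sketched in a comment:

```
# A minimal sketch, assuming tokenizer.json was just written by the script above.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("This is the first sentence.")
# Given the normalizer and post-processor above, the tokens should come out
# roughly as: ['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]']
print(enc.tokens)
print(enc.ids)  # ids vary with the trained vocabulary ordering
```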
The trained file is then wrapped as a `transformers` fast tokenizer and pushed to the Hub:

```
from transformers import PreTrainedTokenizerFast

# Expose the trained file through the transformers API, mapping each
# special-token role onto the tokens defined during training.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="[CLS]",
    eos_token="[SEP]",
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    model_max_length=10,  # deliberately small for a toy tokenizer
    padding_side="right",
)
tokenizer.push_to_hub("dummy-tokenizer-wordlevel", commit_message="add tokenizer")
```
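
Once pushed, it can be loaded back like any Hub tokenizer. A minimal usage sketch, assuming the repository ends up under your own namespace (the `your-username` prefix below is a placeholder):

```
from transformers import AutoTokenizer

# Hypothetical repo id; substitute the namespace the tokenizer was pushed under.
tokenizer = AutoTokenizer.from_pretrained("your-username/dummy-tokenizer-wordlevel")

# Padding and truncation follow the settings baked in above: [PAD] is appended
# on the right, and sequences are capped at model_max_length=10.
batch = tokenizer(
    ["This is the first sentence.", "But not this one."],
    padding=True,
    truncation=True,
)
print(batch["input_ids"])
```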