from datasets import load_dataset
from tokenizers import (
    Tokenizer, models, normalizers, pre_tokenizers,
    decoders, processors, trainers
)

dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')

# Stream the corpus in batches of 1,000 lines; slicing a datasets.Dataset
# returns a dict of columns, so ['text'] yields a list of raw strings.
def get_training_corpus():
    for i in range(0, len(dataset), 1000):
        yield dataset[i:i + 1000]['text']

# WordPiece model; out-of-vocabulary pieces map to "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Unicode-normalize, lowercase, then strip accents (NFD comes first so
# StripAccents can remove the decomposed combining marks).
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents()
])
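
# Quick standalone check of the normalizer (the sample string is arbitrary,
# chosen only to exercise accent stripping):
print(tokenizer.normalizer.normalize_str("Héllò Wörld"))  # -> "hello world"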

# Split on whitespace first, then isolate punctuation into its own tokens.
pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation()
])
tokenizer.pre_tokenizer = pre_tok
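
# The pre-tokenizer can be inspected on its own; it returns a list of
# (piece, offsets) pairs, e.g. splitting "Let's" at the apostrophe:
print(pre_tok.pre_tokenize_str("Let's test!"))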

# The decoder rejoins "##"-prefixed continuation pieces into whole words.
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Pad and truncate every encoding to a fixed length of 128. pad_id=0 is
# only correct because "[PAD]" is listed first in special_tokens below,
# so it receives id 0 during training (as in BERT's own vocabulary).
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128)
tokenizer.enable_truncation(max_length=128)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# Train a 25,000-entry WordPiece vocabulary; the special tokens are
# reserved ids 0-4 in the order given above.
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
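
# Sanity check: the trained vocabulary should not exceed the requested
# 25,000-entry budget (it can come in under it on a small corpus).
print(f"Vocab size: {tokenizer.get_vocab_size()}")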

# Look up the ids the special tokens received during training; the
# post-processor needs (token, id) pairs to insert them.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

# BERT-style templates. The ":1" suffixes assign type_id 1 to the second
# segment; without them every token would get type_id 0.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)]
)
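
# Illustrative check of the pair template (the sentences are arbitrary):
# tokens from the first segment carry type_id 0, the second type_id 1.
pair = tokenizer.encode("First sentence.", "Second sentence.")
print(pair.tokens)
print(pair.type_ids)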

# Serialize the whole pipeline (normalizer, pre-tokenizer, model,
# post-processor, decoder, padding/truncation) to one JSON file.
tokenizer.save("tokenizer.json")
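# It can be reloaded later without retraining:
# tokenizer = Tokenizer.from_file("tokenizer.json")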

encoding = tokenizer.encode("Let's test this tokenizer.")
# Padded out to 128 tokens because of enable_padding above; the apostrophe
# and period are split off by the Punctuation pre-tokenizer.
print(encoding.tokens)
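
# Round trip: decode() merges "##" pieces back into words via the WordPiece
# decoder and skips special tokens by default (output stays lowercase
# because of the Lowercase normalizer).
print(tokenizer.decode(encoding.ids))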