literallybannedfromcallingbob's picture
Upload folder using huggingface_hub
586d30b verified
from datasets import load_dataset
from tokenizers import (
Tokenizer, models, normalizers, pre_tokenizers,
decoders, processors, trainers
)
# Load the "train" split of the "wikitext-2-raw-v1" configuration of WikiText.
dataset = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="train",
)
def get_training_corpus(batch_size: int = 1000):
    """Yield the ``'text'`` column of ``dataset`` in consecutive batches.

    Reads the module-level ``dataset`` and slices it ``batch_size`` rows at a
    time, so the whole corpus is never materialized in a single object.

    Args:
        batch_size: Number of rows per yielded batch. Defaults to 1000, the
            value that was previously hard-coded.

    Yields:
        ``dataset[start:start + batch_size]['text']`` for each batch; the last
        batch may be shorter. (For a Hugging Face ``Dataset`` this is
        presumably a list of strings -- confirm against the dataset type.)

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step would make range() raise an opaque error and a negative step
    # would silently yield nothing, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]["text"]
# Build the tokenizer around a WordPiece model that uses "[UNK]" as its
# unknown token.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

# Normalization pipeline, applied in order: NFD, lowercase, strip accents.
normalization_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(normalization_steps)

# Pre-tokenization: whitespace split first, then the punctuation pre-tokenizer.
pre_tok = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
tokenizer.pre_tokenizer = pre_tok

# Decoder configured with the "##" WordPiece prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")
# Train the tokenizer.
# Special tokens are placed in the vocabulary in list order, so with this list
# "[UNK]" is id 0 and "[PAD]" is id 1. The order is kept unchanged so existing
# ids stay stable.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Enable padding & truncation only after training, so the pad id is read from
# the trained vocabulary instead of being hard-coded. The previous pad_id=0
# pointed at "[UNK]" rather than "[PAD]", so padded positions carried the
# wrong id.
pad_id = tokenizer.token_to_id("[PAD]")
if pad_id is None:
    raise RuntimeError('"[PAD]" is missing from the trained vocabulary')
tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]", length=128)
tokenizer.enable_truncation(max_length=128)
# Look up the vocabulary ids of the two marker tokens used by the templates.
cls_id, sep_id = (tokenizer.token_to_id(marker) for marker in ("[CLS]", "[SEP]"))

# Attach a template post-processor. The special tokens are passed as a list of
# (token, id) tuples; single inputs use "[CLS] $A [SEP]" and pairs use
# "[CLS] $A [SEP] $B [SEP]".
template_special_tokens = [("[CLS]", cls_id), ("[SEP]", sep_id)]
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B [SEP]",
    special_tokens=template_special_tokens,
)
# Write the tokenizer to "tokenizer.json", then encode a sample sentence and
# print its tokens as a quick smoke test.
tokenizer.save("tokenizer.json")
sample_text = "Let's test this tokenizer."
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)