import csv
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers


def text_iterator(file_path):
    # Stream the parallel corpus row by row; the CSV is expected to carry
    # its two text columns under the headers "0" and "1".
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            yield row["0"] + " " + row["1"]
# Byte-level BPE shared across both languages; NFKC normalization folds
# compatibility forms (e.g. full-width punctuation) before merges are learned.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
)
tokenizer.train_from_iterator(text_iterator("data/wmt_zh_en_training_corpus.csv"), trainer=trainer)
# The template wraps every encoded sequence in [SOS] ... [EOS]; it is attached
# after training so token_to_id can resolve the learned special-token ids.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[SOS] $A [EOS]",
    pair="[SOS] $A [EOS] $B [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer.token_to_id("[SOS]")),
        ("[EOS]", tokenizer.token_to_id("[EOS]")),
    ],
)

tokenizer.save("checkpoints/tokenizer.json")
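
# Minimal sanity check (illustrative addition, not part of the original
# script; the sample sentence is arbitrary): reload the saved tokenizer and
# round-trip a string to confirm the [SOS]/[EOS] template and the byte-level
# decoder behave as expected.
reloaded = Tokenizer.from_file("checkpoints/tokenizer.json")
encoding = reloaded.encode("Hello, world!")
print(encoding.tokens)                # ['[SOS]', ..., '[EOS]']
print(reloaded.decode(encoding.ids))  # special tokens are skipped by default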