import csv
import os

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers


def text_iterator(file_path):
    """Yield one training example per CSV row.

    Assumes the corpus CSV has a header row with columns named "0" and "1"
    (the two sides of each sentence pair). Both sides are concatenated so
    the tokenizer learns a single shared vocabulary over source and target.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            yield row["0"] + " " + row["1"]


# BPE model with an explicit unknown token.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Unicode compatibility normalization (NFKC) before pre-tokenization.
tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])

# Byte-level pre-tokenization and decoding, GPT-2 style, so every input
# byte is representable and decoding round-trips cleanly.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
)
tokenizer.train_from_iterator(text_iterator("data/wmt_zh_en_training_corpus.csv"), trainer=trainer)

# Wrap every encoded sequence in [SOS] ... [EOS]; the special-token ids are
# looked up after training, once they exist in the vocabulary.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[SOS] $A [EOS]",
    pair="[SOS] $A [EOS] $B [EOS]",
    special_tokens=[
        ("[SOS]", tokenizer.token_to_id("[SOS]")),
        ("[EOS]", tokenizer.token_to_id("[EOS]")),
    ],
)

os.makedirs("checkpoints", exist_ok=True)  # save() does not create missing directories
tokenizer.save("checkpoints/tokenizer.json")