import datasets

from t5_tokenizer_model import SentencePieceUnigramTokenizer

# Training configuration: target vocabulary size, an optional cap on the number
# of training sentences (None means use the whole dataset), and the directory
# the trained tokenizer is written to.
vocab_size = 32_000
input_sentence_size = None
model_dir = "."

# Load the Swedish split of the OSCAR corpus
dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_sv", split="train")

# Instantiate a SentencePiece Unigram tokenizer with T5's special tokens
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


# Yield batches of raw text; slicing a `datasets.Dataset` returns a dict of
# columns, so each batch is a list of strings from the "text" column.
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i: i + batch_length]["text"]


# Train the Unigram model on the streamed batches
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save the trained tokenizer to disk
tokenizer.save(f"{model_dir}/tokenizer.json")
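

# Optional sanity check (a sketch, not part of the original example): the saved
# tokenizer.json is a standard `tokenizers` file, so it can be loaded back with
# `Tokenizer.from_file` and exercised with a quick encode round trip. The
# Swedish sample sentence below is purely illustrative.
from tokenizers import Tokenizer

loaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
encoding = loaded_tokenizer.encode("Det här är en mening på svenska.")
print(encoding.tokens)  # subword pieces chosen by the trained Unigram model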