from datasets import load_from_disk
from transformers import AutoTokenizer

dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
dataset = dataset["train"]
# Stream the texts in batches of 1,000 so the whole corpus never has to be held in memory at once.
batch_size = 1000
corpus = (dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size))
# ConvBERT uses the BERT tokenizer, so start from a pretrained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Let's use the same vocab size as in Finnish-NLP/roberta-large-finnish-v2,
# which is also very close to that of TurkuNLP/bert-base-finnish-cased-v1
| new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=50265) | |
| new_tokenizer.save_pretrained("./") | |
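
# Quick sanity check (a minimal sketch; the sentence below is a hypothetical
# example, not taken from the training data): reload the saved tokenizer and
# inspect how it splits Finnish text into subword tokens.
new_tokenizer = AutoTokenizer.from_pretrained("./")
print(new_tokenizer.tokenize("Tämä on esimerkkilause suomeksi."))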