# Byte-lingua-code/superbpe/tokenizers_superbpe/bindings/python/examples/train_with_datasets.py
"""Train a BPE tokenizer over wikitext-103, streamed from a Hugging Face dataset.

Builds a lowercasing, whitespace-split BPE tokenizer and feeds it batches of
raw text via an iterator so the full corpus is never materialized in memory.
"""
import datasets
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

# Build a tokenizer: BPE model, whitespace pre-tokenization, lowercase normalization.
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
bpe_tokenizer.normalizer = normalizers.Lowercase()

# Initialize a dataset: the raw (untokenized) wikitext-103 training split.
# NOTE: this downloads the dataset on first run.
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")


def batch_iterator(batch_size=1000):
    """Yield lists of raw text strings, ``batch_size`` examples at a time.

    Args:
        batch_size: Number of examples per yielded batch (default 1000,
            matching the original hard-coded value).

    Yields:
        list[str]: The ``"text"`` column of each dataset batch.
    """
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]


# And finally train. `length` tells the trainer the total example count so
# it can report progress while consuming the iterator.
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))