| { |
| "tokenizer_type": "ByteLevel BPE", |
| "vocab_size": 32000, |
| "target_texts": 700000, |
| "min_chars": 80, |
| "max_chars": 12000, |
| "datasets": [ |
| { |
| "dataset": "allenai/c4", |
| "config": "en", |
| "split": "train", |
| "column": "text", |
| "share": 0.5, |
| "target_texts": 350000 |
| }, |
| { |
| "dataset": "HuggingFaceFW/fineweb-edu", |
| "config": null, |
| "split": "train", |
| "column": "text", |
| "share": 0.2, |
| "target_texts": 140000 |
| }, |
| { |
| "dataset": "wikimedia/wikipedia", |
| "config": "20231101.en", |
| "split": "train", |
| "column": "text", |
| "share": 0.1, |
| "target_texts": 70000 |
| }, |
| { |
| "dataset": "codeparrot/codeparrot-clean", |
| "config": null, |
| "split": "train", |
| "column": "content", |
| "share": 0.1, |
| "target_texts": 70000 |
| }, |
| { |
| "dataset": "allenai/c4", |
| "config": "es", |
| "split": "train", |
| "column": "text", |
| "share": 0.1, |
| "target_texts": 70000 |
| } |
| ], |
| "elapsed_seconds": 209.82311129570007, |
| "elapsed_minutes": 3.4970518549283347, |
| "output_dir": "/out/tokenizer-bpe-32k" |
| } |