{ "tokenizer_type": "ByteLevel BPE", "vocab_size": 32000, "target_texts": 700000, "min_chars": 80, "max_chars": 12000, "datasets": [ { "dataset": "allenai/c4", "config": "en", "split": "train", "column": "text", "share": 0.5, "target_texts": 350000 }, { "dataset": "HuggingFaceFW/fineweb-edu", "config": null, "split": "train", "column": "text", "share": 0.2, "target_texts": 140000 }, { "dataset": "wikimedia/wikipedia", "config": "20231101.en", "split": "train", "column": "text", "share": 0.1, "target_texts": 70000 }, { "dataset": "codeparrot/codeparrot-clean", "config": null, "split": "train", "column": "content", "share": 0.1, "target_texts": 70000 }, { "dataset": "allenai/c4", "config": "es", "split": "train", "column": "text", "share": 0.1, "target_texts": 70000 } ], "elapsed_seconds": 209.82311129570007, "elapsed_minutes": 3.4970518549283347, "output_dir": "/out/tokenizer-bpe-32k" }