| { |
| "version": "10B", |
| "num_docs": 15368808, |
| "num_val_docs": 50000, |
| "max_train_tokens": 8000000000, |
| "shuffle_seed": null, |
| "shard_size": 100000000, |
| "append_eos": false, |
| "docs_jsonl": "docs_selected.jsonl", |
| "docs_meta": { |
| "remote_repo_id": "willdepueoai/parameter-golf", |
| "remote_root": "datasets", |
| "num_docs": 15368808, |
| "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7", |
| "source_manifest": "docs_selected.source_manifest.json", |
| "source_sidecar": { |
| "source_export_root": "/root/exports/fineweb_50Bsub100B_50keval_v0", |
| "snapshot_kind": "partial_docs_cache_from_50B_export", |
| "note": "not canonical 10B shard selection; train split is a paused snapshot of the 50B shuffled train stream", |
| "selection_seed": 1337, |
| "num_val_docs": 50000, |
| "num_docs": 15368808, |
| "docs_val": 50000, |
| "docs_train": 15318808, |
| "docs_bytes": 48166275520, |
| "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7" |
| } |
| }, |
| "tokenizer_specs": [ |
| { |
| "name": "sp_bpe_8192", |
| "dataset_suffix": "sp8192", |
| "vocab_size": 8192, |
| "tokenizer_train_docs": 50000 |
| } |
| ], |
| "tokenizers": [ |
| { |
| "name": "sp_bpe_8192", |
| "kind": "sentencepiece_bpe", |
| "vocab_size": 8192, |
| "bos_id": 1, |
| "eos_id": 2, |
| "recommended_bigram_vocab_size": 40960, |
| "source_spec": { |
| "name": "sp_bpe_8192", |
| "dataset_suffix": "sp8192", |
| "vocab_size": 8192, |
| "tokenizer_train_docs": 50000 |
| }, |
| "model_path": "tokenizers/fineweb_8192_bpe.model", |
| "vocab_path": "tokenizers/fineweb_8192_bpe.vocab" |
| } |
| ], |
| "datasets": [ |
| { |
| "name": "fineweb10B_sp8192", |
| "tokenizer_name": "sp_bpe_8192", |
| "tokenizer_kind": "sentencepiece_bpe", |
| "path": "datasets/fineweb10B_sp8192", |
| "train_glob": "datasets/fineweb10B_sp8192/fineweb_train_*.bin", |
| "val_glob": "datasets/fineweb10B_sp8192/fineweb_val_*.bin", |
| "vocab_size": 8192, |
| "bos_id": 1, |
| "eos_id": 2, |
| "recommended_bigram_vocab_size": 40960, |
| "stats": { |
| "docs_total": 9682720, |
| "docs_val": 50000, |
| "docs_train": 9632720, |
| "files_total": 81, |
| "files_val": 1, |
| "files_train": 80, |
| "tokens_total": 8040547886, |
| "tokens_val": 40547886, |
| "tokens_train": 8000000000 |
| } |
| } |
| ] |
| } |
|
|