{ "version": "10B", "num_docs": 15368808, "num_val_docs": 50000, "max_train_tokens": 8000000000, "shuffle_seed": null, "shard_size": 100000000, "append_eos": false, "docs_jsonl": "docs_selected.jsonl", "docs_meta": { "remote_repo_id": "willdepueoai/parameter-golf", "remote_root": "datasets", "num_docs": 15368808, "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7", "source_manifest": "docs_selected.source_manifest.json", "source_sidecar": { "source_export_root": "/root/exports/fineweb_50Bsub100B_50keval_v0", "snapshot_kind": "partial_docs_cache_from_50B_export", "note": "not canonical 10B shard selection; train split is a paused snapshot of the 50B shuffled train stream", "selection_seed": 1337, "num_val_docs": 50000, "num_docs": 15368808, "docs_val": 50000, "docs_train": 15318808, "docs_bytes": 48166275520, "docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7" } }, "tokenizer_specs": [ { "name": "sp_bpe_8192", "dataset_suffix": "sp8192", "vocab_size": 8192, "tokenizer_train_docs": 50000 } ], "tokenizers": [ { "name": "sp_bpe_8192", "kind": "sentencepiece_bpe", "vocab_size": 8192, "bos_id": 1, "eos_id": 2, "recommended_bigram_vocab_size": 40960, "source_spec": { "name": "sp_bpe_8192", "dataset_suffix": "sp8192", "vocab_size": 8192, "tokenizer_train_docs": 50000 }, "model_path": "tokenizers/fineweb_8192_bpe.model", "vocab_path": "tokenizers/fineweb_8192_bpe.vocab" } ], "datasets": [ { "name": "fineweb10B_sp8192", "tokenizer_name": "sp_bpe_8192", "tokenizer_kind": "sentencepiece_bpe", "path": "datasets/fineweb10B_sp8192", "train_glob": "datasets/fineweb10B_sp8192/fineweb_train_*.bin", "val_glob": "datasets/fineweb10B_sp8192/fineweb_val_*.bin", "vocab_size": 8192, "bos_id": 1, "eos_id": 2, "recommended_bigram_vocab_size": 40960, "stats": { "docs_total": 9682720, "docs_val": 50000, "docs_train": 9632720, "files_total": 81, "files_val": 1, "files_train": 80, "tokens_total": 8040547886, "tokens_val": 40547886, "tokens_train": 8000000000 } } ] }