parameter-golf-tokenizers / manifest.json
sproos's picture
Upload folder using huggingface_hub
c5f9e16 verified
{
"version": "10B",
"num_docs": 15368808,
"num_val_docs": 50000,
"max_train_tokens": 8000000000,
"shuffle_seed": null,
"shard_size": 100000000,
"append_eos": false,
"docs_jsonl": "docs_selected.jsonl",
"docs_meta": {
"remote_repo_id": "willdepueoai/parameter-golf",
"remote_root": "datasets",
"num_docs": 15368808,
"docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7",
"source_manifest": "docs_selected.source_manifest.json",
"source_sidecar": {
"source_export_root": "/root/exports/fineweb_50Bsub100B_50keval_v0",
"snapshot_kind": "partial_docs_cache_from_50B_export",
"note": "not canonical 10B shard selection; train split is a paused snapshot of the 50B shuffled train stream",
"selection_seed": 1337,
"num_val_docs": 50000,
"num_docs": 15368808,
"docs_val": 50000,
"docs_train": 15318808,
"docs_bytes": 48166275520,
"docs_sha256": "84386dfa7b339a5d4831d5273c4a2028b78b60670d3a235633a8520545d19bc7"
}
},
"tokenizer_specs": [
{
"name": "sp_bpe_8192",
"dataset_suffix": "sp8192",
"vocab_size": 8192,
"tokenizer_train_docs": 50000
}
],
"tokenizers": [
{
"name": "sp_bpe_8192",
"kind": "sentencepiece_bpe",
"vocab_size": 8192,
"bos_id": 1,
"eos_id": 2,
"recommended_bigram_vocab_size": 40960,
"source_spec": {
"name": "sp_bpe_8192",
"dataset_suffix": "sp8192",
"vocab_size": 8192,
"tokenizer_train_docs": 50000
},
"model_path": "tokenizers/fineweb_8192_bpe.model",
"vocab_path": "tokenizers/fineweb_8192_bpe.vocab"
}
],
"datasets": [
{
"name": "fineweb10B_sp8192",
"tokenizer_name": "sp_bpe_8192",
"tokenizer_kind": "sentencepiece_bpe",
"path": "datasets/fineweb10B_sp8192",
"train_glob": "datasets/fineweb10B_sp8192/fineweb_train_*.bin",
"val_glob": "datasets/fineweb10B_sp8192/fineweb_val_*.bin",
"vocab_size": 8192,
"bos_id": 1,
"eos_id": 2,
"recommended_bigram_vocab_size": 40960,
"stats": {
"docs_total": 9682720,
"docs_val": 50000,
"docs_train": 9632720,
"files_total": 81,
"files_val": 1,
"files_train": 80,
"tokens_total": 8040547886,
"tokens_val": 40547886,
"tokens_train": 8000000000
}
}
]
}