{ "vocab_size": 49152, "special_tokens": { "pad_token": "<pad>", "bos_token": "<bos>", "eos_token": "<eos>", "unk_token": "<unk>", "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2, "unk_token_id": 3 }, "data_config": { "sources": [ { "name": "fineweb_edu", "path": "HuggingFaceFW/fineweb-edu", "split": "train", "weight": 0.6, "text_field": "text", "config_name": "sample-10BT", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 10000, "sample_documents": null }, { "name": "cosmopedia_v2", "path": "HuggingFaceTB/smollm-corpus", "split": "train", "weight": 0.2, "text_field": "text", "config_name": "cosmopedia-v2", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 10000, "sample_documents": null }, { "name": "the_stack_python", "path": "bigcode/the-stack-dedup", "split": "train", "weight": 0.1, "text_field": "content", "config_name": null, "data_dir": "data/python", "revision": null, "streaming": true, "shuffle_buffer": 2000, "sample_documents": null }, { "name": "finemath", "path": "HuggingFaceTB/finemath", "split": "train", "weight": 0.1, "text_field": "text", "config_name": "finemath-4plus", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 5000, "sample_documents": null } ], "tokenizer_sample_documents": 2000000, "tokenizer_min_frequency": 2, "tokenizer_special_tokens": [ "<pad>", "<bos>", "<eos>", "<unk>" ], "train_tokens": 10000000000, "val_tokens": 20000000, "shard_size_tokens": 100000000 } }