{
  "vocab_size": 49152,
  "special_tokens": {
    "pad_token": "<pad>",
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "unk_token": "<unk>",
    "pad_token_id": 0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "unk_token_id": 3
  },
  "data_config": {
    "sources": [
      {
        "name": "fineweb_edu",
        "path": "HuggingFaceFW/fineweb-edu",
        "split": "train",
        "weight": 0.6,
        "text_field": "text",
        "config_name": "sample-10BT",
        "data_dir": null,
        "revision": null,
        "streaming": true,
        "shuffle_buffer": 10000,
        "sample_documents": null
      },
      {
        "name": "cosmopedia_v2",
        "path": "HuggingFaceTB/smollm-corpus",
        "split": "train",
        "weight": 0.2,
        "text_field": "text",
        "config_name": "cosmopedia-v2",
        "data_dir": null,
        "revision": null,
        "streaming": true,
        "shuffle_buffer": 10000,
        "sample_documents": null
      },
      {
        "name": "the_stack_python",
        "path": "bigcode/the-stack-dedup",
        "split": "train",
        "weight": 0.1,
        "text_field": "content",
        "config_name": null,
        "data_dir": "data/python",
        "revision": null,
        "streaming": true,
        "shuffle_buffer": 2000,
        "sample_documents": null
      },
      {
        "name": "finemath",
        "path": "HuggingFaceTB/finemath",
        "split": "train",
        "weight": 0.1,
        "text_field": "text",
        "config_name": "finemath-4plus",
        "data_dir": null,
        "revision": null,
        "streaming": true,
        "shuffle_buffer": 5000,
        "sample_documents": null
      }
    ],
    "tokenizer_sample_documents": 2000000,
    "tokenizer_min_frequency": 2,
    "tokenizer_special_tokens": [
      "<pad>",
      "<bos>",
      "<eos>",
      "<unk>"
    ],
    "train_tokens": 10000000000,
    "val_tokens": 20000000,
    "shard_size_tokens": 100000000
  }
}