{
"vocab_size": 49152,
"special_tokens": {
"pad_token": "<pad>",
"bos_token": "<bos>",
"eos_token": "<eos>",
"unk_token": "<unk>",
"pad_token_id": 0,
"bos_token_id": 1,
"eos_token_id": 2,
"unk_token_id": 3
},
"data_config": {
"sources": [
{
"name": "fineweb_edu",
"path": "HuggingFaceFW/fineweb-edu",
"split": "train",
"weight": 0.6,
"text_field": "text",
"config_name": "sample-10BT",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "cosmopedia_v2",
"path": "HuggingFaceTB/smollm-corpus",
"split": "train",
"weight": 0.2,
"text_field": "text",
"config_name": "cosmopedia-v2",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "the_stack_python",
"path": "bigcode/the-stack-dedup",
"split": "train",
"weight": 0.1,
"text_field": "content",
"config_name": null,
"data_dir": "data/python",
"revision": null,
"streaming": true,
"shuffle_buffer": 2000,
"sample_documents": null
},
{
"name": "finemath",
"path": "HuggingFaceTB/finemath",
"split": "train",
"weight": 0.1,
"text_field": "text",
"config_name": "finemath-4plus",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 5000,
"sample_documents": null
}
],
"tokenizer_sample_documents": 2000000,
"tokenizer_min_frequency": 2,
"tokenizer_special_tokens": [
"<pad>",
"<bos>",
"<eos>",
"<unk>"
],
"train_tokens": 10000000000,
"val_tokens": 20000000,
"shard_size_tokens": 100000000
}
}