{
"vocab_size": 49152,
"special_tokens": {
"pad_token": "<pad>",
"bos_token": "<bos>",
"eos_token": "<eos>",
"unk_token": "<unk>",
"pad_token_id": 0,
"bos_token_id": 1,
"eos_token_id": 2,
"unk_token_id": 3
},
"data_config": {
"sources": [
{
"name": "fineweb_edu",
"path": "HuggingFaceFW/fineweb-edu",
"split": "train",
"weight": 0.6,
"text_field": "text",
"config_name": "sample-10BT",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "cosmopedia_v2",
"path": "HuggingFaceTB/smollm-corpus",
"split": "train",
"weight": 0.2,
"text_field": "text",
"config_name": "cosmopedia-v2",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "the_stack_python",
"path": "bigcode/the-stack-dedup",
"split": "train",
"weight": 0.1,
"text_field": "content",
"config_name": null,
"data_dir": "data/python",
"revision": null,
"streaming": true,
"shuffle_buffer": 2000,
"sample_documents": null
},
{
"name": "finemath",
"path": "HuggingFaceTB/finemath",
"split": "train",
"weight": 0.1,
"text_field": "text",
"config_name": "finemath-4plus",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 5000,
"sample_documents": null
}
],
"tokenizer_sample_documents": 2000000,
"tokenizer_min_frequency": 2,
"tokenizer_special_tokens": [
"<pad>",
"<bos>",
"<eos>",
"<unk>"
],
"train_tokens": 10000000000,
"val_tokens": 20000000,
"shard_size_tokens": 100000000
}
}