QED-75M_artifacts / data /sft /processed /dataset_summary.json
levossadtchi's picture
Upload folder using huggingface_hub
af047e0 verified
{
"config": {
"val_examples": 2000,
"max_train_examples": 200000,
"min_supervised_tokens": 16,
"shuffle": true,
"format": "messages",
"messages_field": "messages",
"sources": [
{
"source_name": "smol_magpie_ultra",
"path": "HuggingFaceTB/smoltalk",
"config_name": "smol-magpie-ultra",
"split": "train",
"weight": 0.4,
"row_filters": {
"quality": "good"
}
},
{
"source_name": "openhermes",
"path": "HuggingFaceTB/smoltalk",
"config_name": "openhermes-100k",
"split": "train",
"weight": 0.15
},
{
"source_name": "self_oss_instruct",
"path": "HuggingFaceTB/smoltalk",
"config_name": "self-oss-instruct",
"split": "train",
"weight": 0.15
},
{
"source_name": "everyday_conversations",
"path": "HuggingFaceTB/smoltalk",
"config_name": "everyday-conversations",
"split": "train",
"weight": 0.01
},
{
"source_name": "numina_cot",
"path": "HuggingFaceTB/smoltalk",
"config_name": "numina-cot-100k",
"split": "train",
"weight": 0.1
},
{
"source_name": "metamathqa",
"path": "HuggingFaceTB/smoltalk",
"config_name": "metamathqa-50k",
"split": "train",
"weight": 0.05
},
{
"source_name": "longalign",
"path": "HuggingFaceTB/smoltalk",
"config_name": "longalign",
"split": "train",
"weight": 0.015
},
{
"source_name": "ultrachat_200k",
"path": "HuggingFaceH4/ultrachat_200k",
"config_name": null,
"split": "train_sft",
"weight": 0.125
}
]
},
"sources": [
{
"name": "smol_magpie_ultra",
"path": "HuggingFaceTB/smoltalk",
"config_name": "smol-magpie-ultra",
"weight": 0.4,
"train_target": 80000,
"val_target": 800,
"train_examples": 80000,
"val_examples": 800,
"rows_seen": 117281,
"skipped_rows": 36481
},
{
"name": "openhermes",
"path": "HuggingFaceTB/smoltalk",
"config_name": "openhermes-100k",
"weight": 0.15,
"train_target": 30000,
"val_target": 300,
"train_examples": 30000,
"val_examples": 300,
"rows_seen": 31945,
"skipped_rows": 1645
},
{
"name": "self_oss_instruct",
"path": "HuggingFaceTB/smoltalk",
"config_name": "self-oss-instruct",
"weight": 0.15,
"train_target": 30000,
"val_target": 300,
"train_examples": 30000,
"val_examples": 300,
"rows_seen": 30300,
"skipped_rows": 0
},
{
"name": "everyday_conversations",
"path": "HuggingFaceTB/smoltalk",
"config_name": "everyday-conversations",
"weight": 0.01,
"train_target": 2000,
"val_target": 20,
"train_examples": 2000,
"val_examples": 20,
"rows_seen": 2020,
"skipped_rows": 0
},
{
"name": "numina_cot",
"path": "HuggingFaceTB/smoltalk",
"config_name": "numina-cot-100k",
"weight": 0.1,
"train_target": 20000,
"val_target": 200,
"train_examples": 20000,
"val_examples": 200,
"rows_seen": 20200,
"skipped_rows": 0
},
{
"name": "metamathqa",
"path": "HuggingFaceTB/smoltalk",
"config_name": "metamathqa-50k",
"weight": 0.05,
"train_target": 10000,
"val_target": 100,
"train_examples": 10000,
"val_examples": 100,
"rows_seen": 10104,
"skipped_rows": 4
},
{
"name": "longalign",
"path": "HuggingFaceTB/smoltalk",
"config_name": "longalign",
"weight": 0.015,
"train_target": 3000,
"val_target": 30,
"train_examples": 3000,
"val_examples": 30,
"rows_seen": 3030,
"skipped_rows": 0
},
{
"name": "ultrachat_200k",
"path": "HuggingFaceH4/ultrachat_200k",
"config_name": null,
"weight": 0.125,
"train_target": 25000,
"val_target": 250,
"train_examples": 25000,
"val_examples": 250,
"rows_seen": 25250,
"skipped_rows": 0
}
],
"tokenizer_meta": {
"vocab_size": 49152,
"special_tokens": {
"pad_token": "<pad>",
"bos_token": "<bos>",
"eos_token": "<eos>",
"unk_token": "<unk>",
"pad_token_id": 0,
"bos_token_id": 1,
"eos_token_id": 2,
"unk_token_id": 3
},
"data_config": {
"sources": [
{
"name": "fineweb_edu",
"path": "HuggingFaceFW/fineweb-edu",
"split": "train",
"weight": 0.6,
"text_field": "text",
"config_name": "sample-10BT",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "cosmopedia_v2",
"path": "HuggingFaceTB/smollm-corpus",
"split": "train",
"weight": 0.2,
"text_field": "text",
"config_name": "cosmopedia-v2",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 10000,
"sample_documents": null
},
{
"name": "the_stack_python",
"path": "bigcode/the-stack-dedup",
"split": "train",
"weight": 0.1,
"text_field": "content",
"config_name": null,
"data_dir": "data/python",
"revision": null,
"streaming": true,
"shuffle_buffer": 2000,
"sample_documents": null
},
{
"name": "finemath",
"path": "HuggingFaceTB/finemath",
"split": "train",
"weight": 0.1,
"text_field": "text",
"config_name": "finemath-4plus",
"data_dir": null,
"revision": null,
"streaming": true,
"shuffle_buffer": 5000,
"sample_documents": null
}
],
"tokenizer_sample_documents": 2000000,
"tokenizer_min_frequency": 2,
"tokenizer_special_tokens": [
"<pad>",
"<bos>",
"<eos>",
"<unk>"
],
"train_tokens": 10000000000,
"val_tokens": 20000000,
"shard_size_tokens": 100000000
}
},
"train": {
"num_examples": 200000,
"seq_len": 2048,
"input_ids_path": "train_input_ids.bin",
"labels_path": "train_labels.bin"
},
"val": {
"num_examples": 2000,
"seq_len": 2048,
"input_ids_path": "val_input_ids.bin",
"labels_path": "val_labels.bin"
}
}