{ "config": { "val_examples": 2000, "max_train_examples": 200000, "min_supervised_tokens": 16, "shuffle": true, "format": "messages", "messages_field": "messages", "sources": [ { "source_name": "smol_magpie_ultra", "path": "HuggingFaceTB/smoltalk", "config_name": "smol-magpie-ultra", "split": "train", "weight": 0.4, "row_filters": { "quality": "good" } }, { "source_name": "openhermes", "path": "HuggingFaceTB/smoltalk", "config_name": "openhermes-100k", "split": "train", "weight": 0.15 }, { "source_name": "self_oss_instruct", "path": "HuggingFaceTB/smoltalk", "config_name": "self-oss-instruct", "split": "train", "weight": 0.15 }, { "source_name": "everyday_conversations", "path": "HuggingFaceTB/smoltalk", "config_name": "everyday-conversations", "split": "train", "weight": 0.01 }, { "source_name": "numina_cot", "path": "HuggingFaceTB/smoltalk", "config_name": "numina-cot-100k", "split": "train", "weight": 0.1 }, { "source_name": "metamathqa", "path": "HuggingFaceTB/smoltalk", "config_name": "metamathqa-50k", "split": "train", "weight": 0.05 }, { "source_name": "longalign", "path": "HuggingFaceTB/smoltalk", "config_name": "longalign", "split": "train", "weight": 0.015 }, { "source_name": "ultrachat_200k", "path": "HuggingFaceH4/ultrachat_200k", "config_name": null, "split": "train_sft", "weight": 0.125 } ] }, "sources": [ { "name": "smol_magpie_ultra", "path": "HuggingFaceTB/smoltalk", "config_name": "smol-magpie-ultra", "weight": 0.4, "train_target": 80000, "val_target": 800, "train_examples": 80000, "val_examples": 800, "rows_seen": 117281, "skipped_rows": 36481 }, { "name": "openhermes", "path": "HuggingFaceTB/smoltalk", "config_name": "openhermes-100k", "weight": 0.15, "train_target": 30000, "val_target": 300, "train_examples": 30000, "val_examples": 300, "rows_seen": 31945, "skipped_rows": 1645 }, { "name": "self_oss_instruct", "path": "HuggingFaceTB/smoltalk", "config_name": "self-oss-instruct", "weight": 0.15, "train_target": 30000, "val_target": 300, "train_examples": 30000, "val_examples": 300, "rows_seen": 30300, "skipped_rows": 0 }, { "name": "everyday_conversations", "path": "HuggingFaceTB/smoltalk", "config_name": "everyday-conversations", "weight": 0.01, "train_target": 2000, "val_target": 20, "train_examples": 2000, "val_examples": 20, "rows_seen": 2020, "skipped_rows": 0 }, { "name": "numina_cot", "path": "HuggingFaceTB/smoltalk", "config_name": "numina-cot-100k", "weight": 0.1, "train_target": 20000, "val_target": 200, "train_examples": 20000, "val_examples": 200, "rows_seen": 20200, "skipped_rows": 0 }, { "name": "metamathqa", "path": "HuggingFaceTB/smoltalk", "config_name": "metamathqa-50k", "weight": 0.05, "train_target": 10000, "val_target": 100, "train_examples": 10000, "val_examples": 100, "rows_seen": 10104, "skipped_rows": 4 }, { "name": "longalign", "path": "HuggingFaceTB/smoltalk", "config_name": "longalign", "weight": 0.015, "train_target": 3000, "val_target": 30, "train_examples": 3000, "val_examples": 30, "rows_seen": 3030, "skipped_rows": 0 }, { "name": "ultrachat_200k", "path": "HuggingFaceH4/ultrachat_200k", "config_name": null, "weight": 0.125, "train_target": 25000, "val_target": 250, "train_examples": 25000, "val_examples": 250, "rows_seen": 25250, "skipped_rows": 0 } ], "tokenizer_meta": { "vocab_size": 49152, "special_tokens": { "pad_token": "", "bos_token": "", "eos_token": "", "unk_token": "", "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2, "unk_token_id": 3 }, "data_config": { "sources": [ { "name": "fineweb_edu", "path": "HuggingFaceFW/fineweb-edu", "split": "train", "weight": 0.6, "text_field": "text", "config_name": "sample-10BT", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 10000, "sample_documents": null }, { "name": "cosmopedia_v2", "path": "HuggingFaceTB/smollm-corpus", "split": "train", "weight": 0.2, "text_field": "text", "config_name": "cosmopedia-v2", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 10000, "sample_documents": null }, { "name": "the_stack_python", "path": "bigcode/the-stack-dedup", "split": "train", "weight": 0.1, "text_field": "content", "config_name": null, "data_dir": "data/python", "revision": null, "streaming": true, "shuffle_buffer": 2000, "sample_documents": null }, { "name": "finemath", "path": "HuggingFaceTB/finemath", "split": "train", "weight": 0.1, "text_field": "text", "config_name": "finemath-4plus", "data_dir": null, "revision": null, "streaming": true, "shuffle_buffer": 5000, "sample_documents": null } ], "tokenizer_sample_documents": 2000000, "tokenizer_min_frequency": 2, "tokenizer_special_tokens": [ "", "", "", "" ], "train_tokens": 10000000000, "val_tokens": 20000000, "shard_size_tokens": 100000000 } }, "train": { "num_examples": 200000, "seq_len": 2048, "input_ids_path": "train_input_ids.bin", "labels_path": "train_labels.bin" }, "val": { "num_examples": 2000, "seq_len": 2048, "input_ids_path": "val_input_ids.bin", "labels_path": "val_labels.bin" } }