| { |
| "config": { |
| "val_examples": 2000, |
| "max_train_examples": 200000, |
| "min_supervised_tokens": 16, |
| "shuffle": true, |
| "format": "messages", |
| "messages_field": "messages", |
| "sources": [ |
| { |
| "source_name": "smol_magpie_ultra", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "smol-magpie-ultra", |
| "split": "train", |
| "weight": 0.4, |
| "row_filters": { |
| "quality": "good" |
| } |
| }, |
| { |
| "source_name": "openhermes", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "openhermes-100k", |
| "split": "train", |
| "weight": 0.15 |
| }, |
| { |
| "source_name": "self_oss_instruct", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "self-oss-instruct", |
| "split": "train", |
| "weight": 0.15 |
| }, |
| { |
| "source_name": "everyday_conversations", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "everyday-conversations", |
| "split": "train", |
| "weight": 0.01 |
| }, |
| { |
| "source_name": "numina_cot", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "numina-cot-100k", |
| "split": "train", |
| "weight": 0.1 |
| }, |
| { |
| "source_name": "metamathqa", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "metamathqa-50k", |
| "split": "train", |
| "weight": 0.05 |
| }, |
| { |
| "source_name": "longalign", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "longalign", |
| "split": "train", |
| "weight": 0.015 |
| }, |
| { |
| "source_name": "ultrachat_200k", |
| "path": "HuggingFaceH4/ultrachat_200k", |
| "config_name": null, |
| "split": "train_sft", |
| "weight": 0.125 |
| } |
| ] |
| }, |
| "sources": [ |
| { |
| "name": "smol_magpie_ultra", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "smol-magpie-ultra", |
| "weight": 0.4, |
| "train_target": 80000, |
| "val_target": 800, |
| "train_examples": 80000, |
| "val_examples": 800, |
| "rows_seen": 117281, |
| "skipped_rows": 36481 |
| }, |
| { |
| "name": "openhermes", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "openhermes-100k", |
| "weight": 0.15, |
| "train_target": 30000, |
| "val_target": 300, |
| "train_examples": 30000, |
| "val_examples": 300, |
| "rows_seen": 31945, |
| "skipped_rows": 1645 |
| }, |
| { |
| "name": "self_oss_instruct", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "self-oss-instruct", |
| "weight": 0.15, |
| "train_target": 30000, |
| "val_target": 300, |
| "train_examples": 30000, |
| "val_examples": 300, |
| "rows_seen": 30300, |
| "skipped_rows": 0 |
| }, |
| { |
| "name": "everyday_conversations", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "everyday-conversations", |
| "weight": 0.01, |
| "train_target": 2000, |
| "val_target": 20, |
| "train_examples": 2000, |
| "val_examples": 20, |
| "rows_seen": 2020, |
| "skipped_rows": 0 |
| }, |
| { |
| "name": "numina_cot", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "numina-cot-100k", |
| "weight": 0.1, |
| "train_target": 20000, |
| "val_target": 200, |
| "train_examples": 20000, |
| "val_examples": 200, |
| "rows_seen": 20200, |
| "skipped_rows": 0 |
| }, |
| { |
| "name": "metamathqa", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "metamathqa-50k", |
| "weight": 0.05, |
| "train_target": 10000, |
| "val_target": 100, |
| "train_examples": 10000, |
| "val_examples": 100, |
| "rows_seen": 10104, |
| "skipped_rows": 4 |
| }, |
| { |
| "name": "longalign", |
| "path": "HuggingFaceTB/smoltalk", |
| "config_name": "longalign", |
| "weight": 0.015, |
| "train_target": 3000, |
| "val_target": 30, |
| "train_examples": 3000, |
| "val_examples": 30, |
| "rows_seen": 3030, |
| "skipped_rows": 0 |
| }, |
| { |
| "name": "ultrachat_200k", |
| "path": "HuggingFaceH4/ultrachat_200k", |
| "config_name": null, |
| "weight": 0.125, |
| "train_target": 25000, |
| "val_target": 250, |
| "train_examples": 25000, |
| "val_examples": 250, |
| "rows_seen": 25250, |
| "skipped_rows": 0 |
| } |
| ], |
| "tokenizer_meta": { |
| "vocab_size": 49152, |
| "special_tokens": { |
| "pad_token": "<pad>", |
| "bos_token": "<bos>", |
| "eos_token": "<eos>", |
| "unk_token": "<unk>", |
| "pad_token_id": 0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "unk_token_id": 3 |
| }, |
| "data_config": { |
| "sources": [ |
| { |
| "name": "fineweb_edu", |
| "path": "HuggingFaceFW/fineweb-edu", |
| "split": "train", |
| "weight": 0.6, |
| "text_field": "text", |
| "config_name": "sample-10BT", |
| "data_dir": null, |
| "revision": null, |
| "streaming": true, |
| "shuffle_buffer": 10000, |
| "sample_documents": null |
| }, |
| { |
| "name": "cosmopedia_v2", |
| "path": "HuggingFaceTB/smollm-corpus", |
| "split": "train", |
| "weight": 0.2, |
| "text_field": "text", |
| "config_name": "cosmopedia-v2", |
| "data_dir": null, |
| "revision": null, |
| "streaming": true, |
| "shuffle_buffer": 10000, |
| "sample_documents": null |
| }, |
| { |
| "name": "the_stack_python", |
| "path": "bigcode/the-stack-dedup", |
| "split": "train", |
| "weight": 0.1, |
| "text_field": "content", |
| "config_name": null, |
| "data_dir": "data/python", |
| "revision": null, |
| "streaming": true, |
| "shuffle_buffer": 2000, |
| "sample_documents": null |
| }, |
| { |
| "name": "finemath", |
| "path": "HuggingFaceTB/finemath", |
| "split": "train", |
| "weight": 0.1, |
| "text_field": "text", |
| "config_name": "finemath-4plus", |
| "data_dir": null, |
| "revision": null, |
| "streaming": true, |
| "shuffle_buffer": 5000, |
| "sample_documents": null |
| } |
| ], |
| "tokenizer_sample_documents": 2000000, |
| "tokenizer_min_frequency": 2, |
| "tokenizer_special_tokens": [ |
| "<pad>", |
| "<bos>", |
| "<eos>", |
| "<unk>" |
| ], |
| "train_tokens": 10000000000, |
| "val_tokens": 20000000, |
| "shard_size_tokens": 100000000 |
| } |
| }, |
| "train": { |
| "num_examples": 200000, |
| "seq_len": 2048, |
| "input_ids_path": "train_input_ids.bin", |
| "labels_path": "train_labels.bin" |
| }, |
| "val": { |
| "num_examples": 2000, |
| "seq_len": 2048, |
| "input_ids_path": "val_input_ids.bin", |
| "labels_path": "val_labels.bin" |
| } |
| } |