aadityabuilds's picture
Training in progress, step 600
cfcc5a3 verified
{
"cli_args": {
"attn_implementation": "eager",
"auto_resume": true,
"bf16": true,
"cache_dir": "/cache",
"data_dir": "/workspace/data/kernelbook",
"deepspeed": "/workspace/configs/deepspeed/zero3_bf16.json",
"distillation_alpha": 0.5,
"distillation_topk": 100,
"distillation_weight": 1.0,
"dry_run": false,
"effective_batch_size": 8,
"eval_steps": 100,
"fsdp_transformer_layer_cls": "Qwen2DecoderLayer",
"fsdp_use_orig_params": false,
"generate_from_teacher": true,
"generation_batch_size": null,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": true,
"hub_model_id": "aadityabuilds/qwen2-5-coder-7b-kernelbook-sdft",
"learning_rate": 5e-06,
"logging_steps": 5,
"max_completion_length": 4096,
"max_eval_samples": 256,
"max_grad_norm": 1.0,
"max_prompt_length": 4096,
"max_steps": -1,
"max_train_samples": null,
"model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct",
"num_generations": 1,
"num_generations_eval": null,
"num_loss_tokens_to_skip": 0,
"num_train_epochs": 1.0,
"output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final",
"output_root": "/outputs",
"parallel_backend": "deepspeed",
"per_device_eval_batch_size": 1,
"per_device_train_batch_size": 1,
"push_to_hub": true,
"ref_model_mixup_alpha": 0.01,
"ref_model_sync_steps": 128,
"repetition_penalty": 1.0,
"report_to": "wandb",
"resume_from_checkpoint": null,
"run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final",
"save_steps": 50,
"save_total_limit": 10,
"seed": 42,
"steps_per_generation": null,
"sync_ref_model": false,
"target_global_batch_size": 8,
"temperature": 0.7,
"top_k": 0,
"top_p": 0.95,
"wandb_entity": null,
"wandb_mode": "online",
"wandb_project": "triton-sdft",
"warmup_ratio": 0.03,
"weight_decay": 0.01,
"world_size": 4
},
"data_dir": "/workspace/data/kernelbook",
"effective_batch_size": 8,
"manifest": {
"config": {
"created_at": "2026-05-27T05:16:47.175016+00:00",
"dataset_id": "GPUMODE/KernelBook",
"max_output_tokens": 4096,
"max_seq_length": 8192,
"model": "Qwen/Qwen2.5-Coder-7B-Instruct",
"output_dir": "data/kernelbook",
"seed": 42,
"test_ratio": 0.1,
"train_ratio": 0.8,
"val_ratio": 0.1
},
"counts": {
"after_dedup": 15203,
"after_empty_filter": 18162,
"after_output_length_filter": 13267,
"loaded": 18162,
"test": 1360,
"train": 10578,
"validation": 1329
},
"sdft_trainer": {
"eval_dataset": "data/kernelbook/text/sdft/validation",
"sdft_config_hints": {
"generate_from_teacher": true,
"max_completion_length": 4096,
"max_prompt_length": 4096
},
"test_dataset": "data/kernelbook/text/sdft/test",
"train_dataset": "data/kernelbook/text/sdft/train"
},
"sft_trainer": {
"eval_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/validation",
"eval_packing": false,
"packing": true,
"requires_columns": [
"input_ids",
"completion_mask"
],
"sft_config": {
"completion_only_loss": true,
"eval_packing": false,
"max_length": 8192,
"packing": true
},
"test_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/test",
"train_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/train"
},
"token_stats": {
"test": {
"count": 1360.0,
"max": 6072.0,
"min": 519.0,
"p50": 1742.5,
"p90": 3393.1000000000013,
"p95": 4133.1,
"p99": 4980.400000000003,
"truncated_fraction": 0.0
},
"train": {
"count": 10578.0,
"max": 7026.0,
"min": 517.0,
"p50": 1781.5,
"p90": 3559.0,
"p95": 4168.299999999999,
"p99": 4932.459999999999,
"truncated_fraction": 0.0
},
"validation": {
"count": 1329.0,
"max": 7012.0,
"min": 519.0,
"p50": 1787.0,
"p90": 3371.2,
"p95": 3914.3999999999996,
"p99": 4647.0,
"truncated_fraction": 0.0
}
}
},
"method": "sdft",
"model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct",
"output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final",
"run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final",
"world_size": 4
}