{ "cli_args": { "attn_implementation": "eager", "auto_resume": true, "bf16": true, "cache_dir": "/cache", "data_dir": "/workspace/data/kernelbook", "deepspeed": "/workspace/configs/deepspeed/zero3_bf16.json", "distillation_alpha": 0.5, "distillation_topk": 100, "distillation_weight": 1.0, "dry_run": false, "effective_batch_size": 8, "eval_steps": 100, "fsdp_transformer_layer_cls": "Qwen2DecoderLayer", "fsdp_use_orig_params": false, "generate_from_teacher": true, "generation_batch_size": null, "gradient_accumulation_steps": 2, "gradient_checkpointing": true, "hub_model_id": "aadityabuilds/qwen2-5-coder-7b-kernelbook-sdft", "learning_rate": 5e-06, "logging_steps": 5, "max_completion_length": 4096, "max_eval_samples": 256, "max_grad_norm": 1.0, "max_prompt_length": 4096, "max_steps": -1, "max_train_samples": null, "model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct", "num_generations": 1, "num_generations_eval": null, "num_loss_tokens_to_skip": 0, "num_train_epochs": 1.0, "output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final", "output_root": "/outputs", "parallel_backend": "deepspeed", "per_device_eval_batch_size": 1, "per_device_train_batch_size": 1, "push_to_hub": true, "ref_model_mixup_alpha": 0.01, "ref_model_sync_steps": 128, "repetition_penalty": 1.0, "report_to": "wandb", "resume_from_checkpoint": null, "run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final", "save_steps": 50, "save_total_limit": 10, "seed": 42, "steps_per_generation": null, "sync_ref_model": false, "target_global_batch_size": 8, "temperature": 0.7, "top_k": 0, "top_p": 0.95, "wandb_entity": null, "wandb_mode": "online", "wandb_project": "triton-sdft", "warmup_ratio": 0.03, "weight_decay": 0.01, "world_size": 4 }, "data_dir": "/workspace/data/kernelbook", "effective_batch_size": 8, "manifest": { "config": { "created_at": "2026-05-27T05:16:47.175016+00:00", "dataset_id": "GPUMODE/KernelBook", "max_output_tokens": 4096, "max_seq_length": 8192, "model": "Qwen/Qwen2.5-Coder-7B-Instruct", "output_dir": "data/kernelbook", "seed": 42, "test_ratio": 0.1, "train_ratio": 0.8, "val_ratio": 0.1 }, "counts": { "after_dedup": 15203, "after_empty_filter": 18162, "after_output_length_filter": 13267, "loaded": 18162, "test": 1360, "train": 10578, "validation": 1329 }, "sdft_trainer": { "eval_dataset": "data/kernelbook/text/sdft/validation", "sdft_config_hints": { "generate_from_teacher": true, "max_completion_length": 4096, "max_prompt_length": 4096 }, "test_dataset": "data/kernelbook/text/sdft/test", "train_dataset": "data/kernelbook/text/sdft/train" }, "sft_trainer": { "eval_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/validation", "eval_packing": false, "packing": true, "requires_columns": [ "input_ids", "completion_mask" ], "sft_config": { "completion_only_loss": true, "eval_packing": false, "max_length": 8192, "packing": true }, "test_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/test", "train_dataset": "data/kernelbook/tokenized/Qwen2.5-Coder-7B-Instruct/train" }, "token_stats": { "test": { "count": 1360.0, "max": 6072.0, "min": 519.0, "p50": 1742.5, "p90": 3393.1000000000013, "p95": 4133.1, "p99": 4980.400000000003, "truncated_fraction": 0.0 }, "train": { "count": 10578.0, "max": 7026.0, "min": 517.0, "p50": 1781.5, "p90": 3559.0, "p95": 4168.299999999999, "p99": 4932.459999999999, "truncated_fraction": 0.0 }, "validation": { "count": 1329.0, "max": 7012.0, "min": 519.0, "p50": 1787.0, "p90": 3371.2, "p95": 3914.3999999999996, "p99": 4647.0, "truncated_fraction": 0.0 } } }, "method": "sdft", "model": "/cache/local-models/sdft/qwen2-5-coder-7b-instruct", "output_dir": "/__modal/volumes/vo-qWxmkR9prkx4LKrjcfqOmD/modal-sdft-qwen2-5-coder-7b-kernelbook-final", "run_name": "modal-sdft-qwen2-5-coder-7b-kernelbook-final", "world_size": 4 }