{
  "bf16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": false,
    "contiguous_gradients": true,
    "reduce_bucket_size": 200000000,
    "stage3_prefetch_bucket_size": 50000000,
    "stage3_param_persistence_threshold": 1000000,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 8,
    "single_submit": false,
    "overlap_events": true,
    "thread_count": 1
  },
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "zero_force_ds_cpu_optimizer": false,
  "zero_allow_untested_optimizer": true,
  "wall_clock_breakdown": false
}