{
  "bf16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": 5e8,
    "stage3_prefetch_bucket_size": 5e8,
    "stage3_param_persistence_threshold": 1e6,
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "gradient_accumulation_steps": 32,
  "gradient_clipping": 1.0,
  "steps_per_print": 10,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false,
  "communication_data_type": "bf16",
  "prescale_gradients": false,
  "sparse_gradients": false,
  "compression_training": {
    "weight_quantization": {
      "shared_parameters": {},
      "different_groups": {}
    },
    "activation_quantization": {
      "shared_parameters": {},
      "different_groups": {}
    },
    "sparse_pruning": {
      "shared_parameters": {},
      "different_groups": {}
    }
  },
  "flops_profiler": {
    "enabled": false,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 1,
    "detailed": true,
    "output_file": null
  },
  "tensorboard": {
    "enabled": true,
    "output_path": "./logs/tensorboard",
    "job_name": "helion_v2_training"
  }
}