{
  "train_batch_size": 16,
  "gradient_accumulation_steps": 1,
  "optimizer": {
    "type": "ZeroOneAdam",
    "params": {
      "lr": 1e-3,
      "weight_decay": 0.01,
      "bias_correction": false,
      "var_freeze_step": 1000,
      "var_update_scaler": 16,
      "local_step_scaler": 1000,
      "local_step_clipper": 16,
      "cuda_aware": false,
      "comm_backend_name": "nccl"
    }
  },
  "fp16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 3,
    "contiguous_gradients": true,
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_prefetch_bucket_size": 1e7,
    "stage3_param_persistence_threshold": 1e5,
    "reduce_bucket_size": 1e7,
    "sub_group_size": 1e9
  },
  "steps_per_print": 50
}