{
  "train_micro_batch_size_per_gpu": 2,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 100,
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 2000,
    "hysteresis": 2,
    "min_loss_scale": 0.0
  },
  "zero_optimization": {
    "stage": 2,
    "reduce_bucket_size": 50000000,
    "overlap_comm": true
  },
  "sparse_attention": {
    "mode": "fixed",
    "block": 16,
    "different_layout_per_head": true,
    "num_local_blocks": 8,
    "num_global_blocks": 1,
    "attention": "unidirectional",
    "horizontal_global_attention": false,
    "num_different_global_patterns": 8
  }
}