Trouter-Library committed on
Commit
e6fc028
·
verified ·
1 Parent(s): 870da8b

Create optimizer_config.json

Browse files
Files changed (1) hide show
  1. optimizer_config.json +256 -0
optimizer_config.json ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "optimizer_name": "AdamW",
3
+ "optimizer_class": "torch.optim.AdamW",
4
+
5
+ "hyperparameters": {
6
+ "learning_rate": 3.0e-4,
7
+ "betas": [0.9, 0.95],
8
+ "eps": 1.0e-8,
9
+ "weight_decay": 0.1,
10
+ "amsgrad": false,
11
+ "maximize": false,
12
+ "foreach": null,
13
+ "capturable": false,
14
+ "differentiable": false,
15
+ "fused": true
16
+ },
17
+
18
+ "learning_rate_schedule": {
19
+ "scheduler_type": "cosine_with_warmup",
20
+ "warmup_steps": 2000,
21
+ "total_steps": 875000,
22
+ "min_lr": 3.0e-5,
23
+ "max_lr": 3.0e-4,
24
+ "warmup_init_lr": 0.0,
25
+ "cycle_mult": 1.0,
26
+ "last_epoch": -1
27
+ },
28
+
29
+ "gradient_configuration": {
30
+ "max_grad_norm": 1.0,
31
+ "gradient_accumulation_steps": 8,
32
+ "gradient_checkpointing": true,
33
+ "gradient_checkpointing_policy": "full_deterministic"
34
+ },
35
+
36
+ "mixed_precision": {
37
+ "enabled": true,
38
+ "dtype": "bfloat16",
39
+ "loss_scale": "dynamic",
40
+ "initial_scale_power": 16,
41
+ "scale_growth_factor": 2.0,
42
+ "backoff_factor": 0.5,
43
+ "scale_window": 1000,
44
+ "hysteresis": 2,
45
+ "min_loss_scale": 1.0
46
+ },
47
+
48
+ "optimization_stages": [
49
+ {
50
+ "stage": "pretraining",
51
+ "steps": 750000,
52
+ "learning_rate": 3.0e-4,
53
+ "weight_decay": 0.1,
54
+ "scheduler": "cosine",
55
+ "warmup_steps": 2000,
56
+ "description": "Initial pretraining phase on diverse corpus"
57
+ },
58
+ {
59
+ "stage": "domain_adaptation",
60
+ "steps": 80000,
61
+ "learning_rate": 1.0e-4,
62
+ "weight_decay": 0.1,
63
+ "scheduler": "constant",
64
+ "warmup_steps": 0,
65
+ "description": "Continued pretraining on domain-specific data"
66
+ },
67
+ {
68
+ "stage": "instruction_tuning",
69
+ "steps": 45000,
70
+ "learning_rate": 5.0e-5,
71
+ "weight_decay": 0.01,
72
+ "scheduler": "linear_decay",
73
+ "warmup_steps": 500,
74
+ "description": "Fine-tuning for instruction following"
75
+ }
76
+ ],
77
+
78
+ "parameter_groups": [
79
+ {
80
+ "name": "embeddings",
81
+ "modules": ["model.embed_tokens", "lm_head"],
82
+ "learning_rate_multiplier": 1.0,
83
+ "weight_decay": 0.1
84
+ },
85
+ {
86
+ "name": "attention",
87
+ "modules": [
88
+ "self_attn.q_proj",
89
+ "self_attn.k_proj",
90
+ "self_attn.v_proj",
91
+ "self_attn.o_proj"
92
+ ],
93
+ "learning_rate_multiplier": 1.0,
94
+ "weight_decay": 0.1
95
+ },
96
+ {
97
+ "name": "mlp",
98
+ "modules": [
99
+ "mlp.gate_proj",
100
+ "mlp.up_proj",
101
+ "mlp.down_proj"
102
+ ],
103
+ "learning_rate_multiplier": 1.0,
104
+ "weight_decay": 0.1
105
+ },
106
+ {
107
+ "name": "layer_norms",
108
+ "modules": [
109
+ "input_layernorm",
110
+ "post_attention_layernorm",
111
+ "model.norm"
112
+ ],
113
+ "learning_rate_multiplier": 1.0,
114
+ "weight_decay": 0.0
115
+ }
116
+ ],
117
+
118
+ "advanced_techniques": {
119
+ "layer_wise_lr_decay": {
120
+ "enabled": false,
121
+ "decay_rate": 0.95,
122
+ "description": "Apply learning rate decay by layer depth"
123
+ },
124
+
125
+ "warmup_schedule": {
126
+ "type": "linear",
127
+ "steps": 2000,
128
+ "start_lr": 0.0,
129
+ "target_lr": 3.0e-4
130
+ },
131
+
132
+ "gradient_clipping": {
133
+ "method": "norm",
134
+ "max_norm": 1.0,
135
+ "norm_type": 2.0
136
+ },
137
+
138
+ "optimizer_state_sharding": {
139
+ "enabled": true,
140
+ "strategy": "zero_stage_2",
141
+ "offload_optimizer": false,
142
+ "offload_params": false
143
+ }
144
+ },
145
+
146
+ "memory_optimization": {
147
+ "cpu_offload": {
148
+ "enabled": false,
149
+ "offload_optimizer_states": false,
150
+ "offload_params": false,
151
+ "pin_memory": true
152
+ },
153
+
154
+ "activation_checkpointing": {
155
+ "enabled": true,
156
+ "checkpoint_every_n_layers": 1,
157
+ "use_reentrant": false
158
+ },
159
+
160
+ "zero_optimization": {
161
+ "stage": 2,
162
+ "offload_optimizer": false,
163
+ "offload_param": false,
164
+ "overlap_comm": true,
165
+ "contiguous_gradients": true,
166
+ "reduce_bucket_size": 5.0e8,
167
+ "allgather_bucket_size": 5.0e8,
168
+ "sub_group_size": 1.0e9,
169
+ "round_robin_gradients": false
170
+ }
171
+ },
172
+
173
+ "monitoring": {
174
+ "log_optimizer_states": true,
175
+ "log_learning_rate": true,
176
+ "log_gradient_norm": true,
177
+ "log_parameter_norm": true,
178
+ "log_interval": 100,
179
+
180
+ "tracked_metrics": [
181
+ "lr",
182
+ "grad_norm",
183
+ "param_norm",
184
+ "loss_scale",
185
+ "overflow_count",
186
+ "step_time",
187
+ "samples_per_second",
188
+ "tokens_per_second"
189
+ ]
190
+ },
191
+
192
+ "convergence_criteria": {
193
+ "max_steps": 875000,
194
+ "early_stopping": {
195
+ "enabled": false,
196
+ "patience": 10000,
197
+ "min_delta": 0.001,
198
+ "monitor": "eval_loss"
199
+ },
200
+
201
+ "plateau_detection": {
202
+ "enabled": true,
203
+ "patience": 5000,
204
+ "threshold": 0.001,
205
+ "cooldown": 1000
206
+ }
207
+ },
208
+
209
+ "stability_features": {
210
+ "loss_spike_detection": {
211
+ "enabled": true,
212
+ "threshold": 2.0,
213
+ "window_size": 100,
214
+ "action": "skip_update"
215
+ },
216
+
217
+ "gradient_overflow_detection": {
218
+ "enabled": true,
219
+ "max_overflow_count": 10,
220
+ "action": "reduce_loss_scale"
221
+ },
222
+
223
+ "nan_inf_detection": {
224
+ "enabled": true,
225
+ "check_frequency": 100,
226
+ "action": "rollback_checkpoint"
227
+ }
228
+ },
229
+
230
+ "distributed_optimization": {
231
+ "backend": "nccl",
232
+ "gradient_as_bucket_view": true,
233
+ "static_graph": false,
234
+ "ddp_bucket_cap_mb": 25,
235
+ "find_unused_parameters": false,
236
+ "broadcast_buffers": true,
237
+
238
+ "communication_optimization": {
239
+ "fp16_reduce_scatter": false,
240
+ "bf16_reduce_scatter": true,
241
+ "bucket_size_multiplier": 1.0,
242
+ "overlap_grad_reduce": true,
243
+ "use_multi_stream": true
244
+ }
245
+ },
246
+
247
+ "checkpointing": {
248
+ "save_optimizer_states": true,
249
+ "save_scheduler_states": true,
250
+ "save_rng_states": true,
251
+ "checkpoint_format": "pytorch",
252
+ "async_save": true,
253
+ "save_interval_steps": 5000,
254
+ "keep_last_n_checkpoints": 10
255
+ }
256
+ }