yuyuzhang commited on
Commit
d7cbb90
·
verified ·
1 Parent(s): 8fbce14

Delete cruise_cli.yaml

Browse files
Files changed (1) hide show
  1. cruise_cli.yaml +0 -456
cruise_cli.yaml DELETED
@@ -1,456 +0,0 @@
1
- trainer:
2
- default_root_dir: null
3
- default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/shen.zheng1/seekpath/basemodelv7-lctx/sft-lctx-swebench/run2
4
- logger:
5
- - tracking
6
- - console
7
- log_every_n_steps: 50
8
- benchmark: false
9
- enable_speedmonitor: true
10
- stats_speedmonitor: false
11
- enable_versions: false
12
- detect_anomaly: false
13
- deterministic: false
14
- accelerator: gpu
15
- accelerator_kwargs:
16
- mega_config: null
17
- precision: bf16
18
- max_epochs: 1
19
- max_steps: -1
20
- limit_train_batches: null
21
- limit_val_batches: null
22
- limit_test_batches: null
23
- static_sync_limit_val: false
24
- sync_batchnorm: false
25
- sync_fit_metrics: null
26
- val_check_interval:
27
- - 20000000
28
- save_before_val: false
29
- accumulate_grad_batches: null
30
- gradient_clip_val: 1.0
31
- max_grad_clip: 0.0
32
- seed: null
33
- summarize_model_depth: 0
34
- resume_ckpt_path: null
35
- frozen_ckpt_path: null
36
- resume_strict: true
37
- resume_optimizer: true
38
- resume_metadata: true
39
- resume_loader_state: false
40
- callbacks: null
41
- enable_checkpoint:
42
- - 1
43
- - 10000
44
- checkpoint_monitor: step
45
- checkpoint_mode: max
46
- dataloader_timeout: -1
47
- dataloader_retry_limit: 100
48
- dataloader_retry_persistent_limit: 5
49
- find_unused_parameters: false
50
- project_name: seekpath_v2
51
- experiment_name: P61_D6_8B_npu_8M_tp2_stage2_630CT_FIM
52
- enable_trace: false
53
- reload_dataloaders_every_n_epochs: -1
54
- strategy: megatron
55
- enable_qat: false
56
- no_quant_module: []
57
- enable_ptq: true
58
- qat_kwargs: {}
59
- optimizer_kwargs:
60
- optimizer:
61
- type: adam
62
- params:
63
- lr: 2.0e-05
64
- betas:
65
- - 0.9
66
- - 0.95
67
- eps: 1.0e-08
68
- weight_decay: 0.1
69
- bias_correction: true
70
- adam_w_mode: true
71
- momentum: 0.9
72
- lr_mult_keys: []
73
- no_weight_decay_keys: []
74
- weight_decay_keys: []
75
- lr_mult_start_epoch: 0
76
- lr_mult: 1.0
77
- force_bfloat16_state: false
78
- scheduler:
79
- type: megatron.optimizer_param_schedule.OptimizerParamScheduler
80
- total_steps_param_name: num_training_steps
81
- warmup_steps_param_name: num_warmup_steps
82
- interval: step
83
- params:
84
- warmup_step_rate: 0.0
85
- lr_end: 0.1
86
- lr_decay_style: cosine
87
- lr_decay_rate: 1.0
88
- grad_norm_layers: []
89
- checkpoint_kwargs:
90
- verbose: false
91
- save_last: false
92
- save_weights_only: false
93
- every_n_train_steps: -1
94
- every_n_seconds: -1
95
- save_best: false
96
- storage:
97
- enable_shm_download: false
98
- enable_shm_upload: false
99
- download_thread_num: 16
100
- upload_thread_num: 1
101
- skip_last_dataloader_ckpt: true
102
- magnus_ckpt_path: ''
103
- enable_auto_align_ckpt_path: false
104
- enable_save_checkpoint_async: true
105
- enable_profiler: false
106
- profiler_schedule_kwargs:
107
- wait: 50
108
- warmup: 3
109
- active: 3
110
- repeat: 1
111
- profile_all_ranks: false
112
- enable_bsdp: false
113
- bsdp_num_prefetch: 64
114
- keep_frozen_weights: true
115
- val_reduce_fn: {}
116
- experiment_id: null
117
- enable_omnistore: true
118
- mesh_num_group: -1
119
- mesh_gpus_per_group: -1
120
- model:
121
- network:
122
- scale_attn_weights: true
123
- reorder_and_upcast_attn: false
124
- gradient_checkpointing: false
125
- gradient_checkpointing_ln: false
126
- gradient_checkpointing_mlp: false
127
- gradient_checkpointing_start_layers: 0
128
- use_ft_flash_attn: false
129
- use_ft_linear: false
130
- use_ft_layernorm: false
131
- use_rmpad: true
132
- pad_output: false
133
- value_moe_num_expert: 0
134
- value_moe_qkv_topk: 4
135
- value_moe_qkv_times: 1
136
- value_moe_is_repeat: true
137
- value_moe_expert_type: linear-lego
138
- value_moe_gate_type: default-lego
139
- value_moe_gate_metric_type: default
140
- cont_train_mode: default
141
- exact_token_as_loss_denominator: false
142
- fuse_lora_weight: true
143
- save_mixed_ckpt_in_shards: false
144
- save_mixed_model_states_freq: final
145
- skip_n_iters: -1
146
- hidden_size: 4096
147
- n_embed: 4096
148
- n_inner: 14336
149
- n_head: 32
150
- n_layer: 32
151
- vocab_size: 155136
152
- max_position_embeddings: 32768
153
- cross_entropy_spilt_num: 1
154
- layer_norm_epsilon: 1.0e-05
155
- activation_function: gelu_new
156
- resid_pdrop: 0.1
157
- embd_pdrop: 0.0
158
- attn_pdrop: 0.1
159
- scale_attn_by_inverse_layer_idx: false
160
- initializer_range: 0.009882118
161
- tie_weight: false
162
- pad_idx: 1
163
- use_xperf_rotary: false
164
- fuse_gelu_gemm: false
165
- position_embeddings_type: rope
166
- n_shared_qhead: 4
167
- num_q_heads: -1
168
- num_kv_heads: -1
169
- head_dim: -1
170
- kv_mirror_layers: []
171
- kv_mirror_imitated_layers: []
172
- hidden_decoding_layers: []
173
- hidden_decoding_imitated_layers: []
174
- residual_post_ln_layers: []
175
- hyperconnection_rate: -1
176
- repeat_kv_heads: true
177
- sparse_attention_window_size:
178
- - -1
179
- use_query_swiglu: false
180
- query_swiglu_inner_dim: 8192
181
- force_mem_efficient_layers:
182
- - -1
183
- noop_transformer_layers: []
184
- dense_ffn_layers: []
185
- dense_ffn_type: swiglu
186
- dense_ffn_inner_dim: -1
187
- moe_expert_type: exp-xelego
188
- moe_gate_type: caplog-lego
189
- moe_gate_metric_type: lego
190
- moe_expert_exp_level: 4
191
- moe_expert_exp_first_dim_factor: 1.0
192
- moe_expert_exp_first_num: 2
193
- moe_topk: 5
194
- moe_num_expert: 0
195
- moe_expert_eq_dim_factor: 0.25
196
- moe_backend: default
197
- moe_overlap_recomp_grad_comm: false
198
- moe_expert_op_version: V1
199
- moe_aux_loss_weight: 0.001
200
- moe_gate_dropout: 0.0
201
- moe_use_balance: false
202
- moe_expert_group_capacity: 1.0
203
- moe_expert_group_balance_loss_weight: 0.0
204
- moe_expert_groups_in_ep_rank: 1
205
- moe_enable_warmup: false
206
- moe_swiglu_fc1_2_init_scale: 1.0
207
- janus_use_big_op: false
208
- janus_big_op_version: V1
209
- janus_big_op_attn_grad_accum_fusion: true
210
- janus_p7_big_op_mlp_fwd_rs_fp8_compression: ''
211
- janus_p7_big_op_mlp_bwd_ag_fp8_compression: ''
212
- janus_big_op_offload_enable: false
213
- convert_gate_to_fp32: false
214
- moe_enable_ema_update: 1
215
- query_head_scale_factor: 1
216
- moe_pr_scale_factor: 1.0
217
- moe_pr_expert_type: disabled
218
- lora_rank: 0
219
- rope_mode: default
220
- rope_scale: 1
221
- rope_base: 500000.0
222
- rope_cut: false
223
- rope_cut_head_dim: 0
224
- rope_force_fp32: false
225
- sparse_attention_window_scale: 1
226
- sparse_attention_global_window_size:
227
- - 0
228
- use_attention_bias: false
229
- layer_norm_type: npu_rmsnorm
230
- use_key_layernorm: false
231
- key_norm_after_rope: false
232
- use_query_layernorm: false
233
- use_context_groupnorm: false
234
- use_mariana_gqa_pattern: false
235
- use_sequence_parallel_attention: false
236
- use_sequence_parallel_attention_a2a: false
237
- context_parallel_use_all_gather: false
238
- enable_hybrid_data_parallel: false
239
- cross_entropy_fusion: none
240
- rope_gen_method: loader
241
- fp8_use_bf16_layers: ''
242
- use_lightweight_fp8: false
243
- deterministic_mode: false
244
- megatron_tensor_parallel_size: 8
245
- megatron_pipeline_parallel_size: 1
246
- megatron_context_parallel_size: 1
247
- megatron_expert_parallel_size: 1
248
- megatron_expert_parallel_size_in_dp: 1
249
- megatron_context_parallel_query_only: false
250
- megatron_num_layers_per_virtual_pipeline_stage: 0
251
- megatron_micro_batch_size: 1
252
- megatron_global_batch_size: 32
253
- megatron_sequence_parallel: true
254
- megatron_recompute_granularity: ''
255
- megatron_use_flash_attention: true
256
- megatron_recompute_method: uniform
257
- megatron_recompute_num_layers: 1
258
- megatron_distribute_saved_activations: false
259
- megatron_enable_distributed_optimizer: true
260
- megatron_use_multi_precision_ddp: false
261
- megatron_sequence_parallel_as_data_parallel_in_optimizer: false
262
- megatron_param_alignment_in_bytes: 0
263
- megatron_gather_params_use_alltoall: false
264
- megatron_enable_initial_jit_warmup: true
265
- megatron_accumulate_allreduce_grads_in_fp32: true
266
- megatron_bf16_use_bf16_allreduce_grads: false
267
- megatron_grad_comm_type: ''
268
- megatron_reduce_grads_use_alltoall: false
269
- megatron_scale_loss_in_gradient: false
270
- megatron_scale_gradient_after_allreduce: false
271
- megatron_ddp_impl: local
272
- megatron_bf16_qt: false
273
- megatron_empty_cache_level: 0
274
- megatron_force_fp32_embed: false
275
- megatron_deterministic_flash_attn: false
276
- megatron_switch_pp_and_dp: false
277
- megatron_timing_log_level: 2
278
- megatron_no_load_rng: false
279
- megatron_no_save_rng: false
280
- megatron_no_load_optim: false
281
- megatron_mem_efficient_column_parallel: true
282
- megatron_masked_softmax_fusion: true
283
- megatron_bias_gelu_fusion: false
284
- megatron_bias_dropout_fusion: false
285
- megatron_gradient_accumulation_fusion: true
286
- megatron_overlap_p2p_comm: false
287
- megatron_deallocate_pipeline_outputs: true
288
- megatron_timing_log_option: local
289
- megatron_barrier_with_L1_time: false
290
- megatron_strict_align_diff_with_ds: false
291
- megatron_parallel_linear_force_weight_contiguous: false
292
- megatron_use_mariana_softmax: false
293
- megatron_use_mariana_activation: false
294
- megatron_overlap_data_parallel_communication: false
295
- megatron_overlap_dp_grad_comm: false
296
- megatron_overlap_dp_param_comm: false
297
- megatron_early_prefetch_dp_allgather: true
298
- megatron_use_non_sequential_block: false
299
- megatron_overlap_attn_grad_input_comm: true
300
- megatron_sequence_data_parallel_size: -1
301
- megatron_distributed_sequence_parallel_size: -1
302
- megatron_num_layers_for_pipeline_stages: []
303
- megatron_vocab_parallel_embedding_fusion: false
304
- megatron_embedding_reduce_scatter_for_sp: true
305
- megatron_print_args: true
306
- megatron_grad_norm_skip: -1.0
307
- megatron_reorder_wgrad: false
308
- megatron_lm_logits_reorder_wgrad: false
309
- megatron_lm_logits_lastn_wgrad: 0
310
- megatron_offload_activations: false
311
- megatron_offload_ratio: 1.0
312
- megatron_offload_launch_ratio: 1.0
313
- megatron_optimizer_offload_main_param: false
314
- megatron_optimizer_offload_state: false
315
- megatron_optimizer_offload_overlap_with_dp: false
316
- megatron_data_parallel_random_init: false
317
- megatron_pipeline_strategy: ''
318
- megatron_pipeline_wgrad_strategy: ''
319
- megatron_pipeline_warmup_overlap: false
320
- megatron_pipeline_fuse1f1b: false
321
- megatron_allow_transformer_engine: false
322
- megatron_fp8_e4m3: false
323
- megatron_fp8_hybrid: false
324
- megatron_fp8_wgrad: true
325
- megatron_fp8_dgrad: true
326
- megatron_fp8_margin: 0
327
- megatron_fp8_interval: 1
328
- megatron_transformer_impl: local
329
- megatron_fp8_amax_history_len: 1024
330
- megatron_fp8_amax_compute_algo: max
331
- megatron_use_qlora: false
332
- megatron_qlora_quant_weight_dtype: null
333
- megatron_qlora_quant_real_store: false
334
- megatron_qlora_quant_groupsize: -1
335
- megatron_qlora_quant_input_dtype: ''
336
- megatron_qlora_quant_aware_lora: false
337
- megatron_qlora_quant_aware_L4Q: false
338
- megatron_terapipe_nano_batch_size: -1
339
- lora_config:
340
- default:
341
- lora_dropout: 0.0
342
- lora_rank: 64
343
- layers:
344
- - all
345
- init_method: normal
346
- init_mode: nonzero_parallel_init
347
- init_kwargs: {}
348
- lora_alpha: 2.0
349
- use_rslora: true
350
- lora_experts_appr: full
351
- use_qlora: false
352
- qlora_quant_weight_dtype: null
353
- qlora_quant_real_store: false
354
- qlora_quant_aware_L4Q: false
355
- qlora_quant_groupsize: -1
356
- qlora_quant_input_dtype: None
357
- qlora_quant_aware_lora: false
358
- post_training_quant: false
359
- fully_sharded: false
360
- emb_trainable: true
361
- target_modules:
362
- - query_key_value
363
- - experts
364
- - dense
365
- query_key_value:
366
- lora_rank: -1
367
- lora_alpha: -1.0
368
- experts:
369
- lora_rank: -1
370
- lora_alpha: -1.0
371
- dense:
372
- lora_rank: -1
373
- lora_alpha: -1.0
374
- dense_h_to_4h:
375
- lora_rank: -1
376
- lora_alpha: -1.0
377
- dense_4h_to_h:
378
- lora_rank: -1
379
- lora_alpha: -1.0
380
- freeze_prefix: null
381
- partial_pretrain: hdfs://haruna/home/byte_data_seed/ssd_hldy/evals_pipeline/checkpoints/20250224/home/byte_data_seed/hdd_hldy/user/sujing.29/seekpath/P61_D6_8B_8M_tp2_stage2_630CT_FIM_LCTX_GPU/checkpoints/global_step_206000/megatron_merge_states.pt
382
- partial_pretrain_rename: null
383
- reset_global_step: -1
384
- override_lr_scheduler: true
385
- start_debug_server: false
386
- clip_token_ids: false
387
- data:
388
- train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/shen-sft/data/sft-lctx-swebench
389
- val_path: ''
390
- train_size: 15177351717
391
- val_size: -1
392
- train_batch_size: 32
393
- train_num_workers: 4
394
- val_batch_size: -1
395
- val_num_workers: 1
396
- max_seq_len: 32768
397
- val_max_seq_len: -1
398
- text_keys:
399
- - content_split
400
- tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
401
- gpu_prefetch: false
402
- cpu_prefetch: false
403
- dyn_bsz: true
404
- dyn_bsz_margin: 0.0
405
- stride: -1
406
- warmup_step_rate: -1.0
407
- tokenizer_type: bbpe
408
- bsz_warmup: true
409
- bsz_warmup_rate: 0.03
410
- return_source: true
411
- synthetic_sample: false
412
- synthetic_batch: false
413
- seq_lens: null
414
- seq_probs: null
415
- enable_sampling_ratios: false
416
- train_path_with_ratio: null
417
- src_weights: null
418
- parse_aug_data: true
419
- loader_accumulate: -1
420
- bsz_warmup_warmup_step_rate: 0.0
421
- max_epochs: 1
422
- pad_idx: 1
423
- strategy: megatron
424
- megatron_micro_batch_size: 1
425
- use_rmpad: true
426
- hidden_size: 4096
427
- megatron_sequence_parallel: true
428
- max_position_embeddings: 32768
429
- position_embeddings_type: rope
430
- use_sequence_parallel_attention: false
431
- use_sequence_parallel_attention_a2a: false
432
- resume_ckpt_path: ''
433
- val_override_est_steps: false
434
- init_without_cli: true
435
- rope_mode: default
436
- rope_scale: 1
437
- rope_base: 500000.0
438
- rope_cut: false
439
- rope_cut_head_dim: 0
440
- init_val_loader_worker_beforehand: false
441
- megatron_global_batch_size: 1
442
- megatron_tensor_parallel_size: 1
443
- megatron_pipeline_parallel_size: 1
444
- n_head: 1
445
- log_level: INFO
446
- val_only: false
447
- download_ckpt_in_shards: true
448
- gc_interval: 50
449
- disable_ckpt_verifier: false
450
- profiler_at_iter: -1
451
- timer_at_iter: -1
452
- profile_all_ranks: false
453
- profile_ranks: []
454
- profile_every_n_steps: -1
455
- profiler_memory_at_iter: null
456
- profile_max_preview_rank: 0