---
trainer:
  default_root_dir: null
  default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/shen.zheng1/seekpath/basemodelv7-lctx/sft-lctx-swebench/run2
  logger:
  - tracking
  - console
  log_every_n_steps: 50
  benchmark: false
  enable_speedmonitor: true
  stats_speedmonitor: false
  enable_versions: false
  detect_anomaly: false
  deterministic: false
  accelerator: gpu
  accelerator_kwargs:
    mega_config: null
  precision: bf16
  max_epochs: 1
  max_steps: -1
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  static_sync_limit_val: false
  sync_batchnorm: false
  sync_fit_metrics: null
  val_check_interval:
  - 20000000
  save_before_val: false
  accumulate_grad_batches: null
  gradient_clip_val: 1.0
  max_grad_clip: 0.0
  seed: null
  summarize_model_depth: 0
  resume_ckpt_path: null
  frozen_ckpt_path: null
  resume_strict: true
  resume_optimizer: true
  resume_metadata: true
  resume_loader_state: false
  callbacks: null
  enable_checkpoint:
  - 1
  - 10000
  checkpoint_monitor: step
  checkpoint_mode: max
  dataloader_timeout: -1
  dataloader_retry_limit: 100
  dataloader_retry_persistent_limit: 5
  find_unused_parameters: false
  project_name: seekpath_v2
  experiment_name: P61_D6_8B_npu_8M_tp2_stage2_630CT_FIM
  enable_trace: false
  reload_dataloaders_every_n_epochs: -1
  strategy: megatron
  enable_qat: false
  no_quant_module: []
  enable_ptq: true
  qat_kwargs: {}
  optimizer_kwargs:
    optimizer:
      type: adam
      params:
        lr: 2.0e-05
        betas:
        - 0.9
        - 0.95
        eps: 1.0e-08
        weight_decay: 0.1
        bias_correction: true
        adam_w_mode: true
        momentum: 0.9
        lr_mult_keys: []
        no_weight_decay_keys: []
        weight_decay_keys: []
        lr_mult_start_epoch: 0
        lr_mult: 1.0
        force_bfloat16_state: false
    scheduler:
      type: megatron.optimizer_param_schedule.OptimizerParamScheduler
      total_steps_param_name: num_training_steps
      warmup_steps_param_name: num_warmup_steps
      interval: step
      params:
        warmup_step_rate: 0.0
        lr_end: 0.1
        lr_decay_style: cosine
        lr_decay_rate: 1.0
  grad_norm_layers: []
  checkpoint_kwargs:
    verbose: false
    save_last: false
    save_weights_only: false
    every_n_train_steps: -1
    every_n_seconds: -1
    save_best: false
    storage:
      enable_shm_download: false
      enable_shm_upload: false
      download_thread_num: 16
      upload_thread_num: 1
    skip_last_dataloader_ckpt: true
    magnus_ckpt_path: ''
    enable_auto_align_ckpt_path: false
  enable_save_checkpoint_async: true
  enable_profiler: false
  profiler_schedule_kwargs:
    wait: 50
    warmup: 3
    active: 3
    repeat: 1
  profile_all_ranks: false
  enable_bsdp: false
  bsdp_num_prefetch: 64
  keep_frozen_weights: true
  val_reduce_fn: {}
  experiment_id: null
  enable_omnistore: true
  mesh_num_group: -1
  mesh_gpus_per_group: -1
model:
  network:
    scale_attn_weights: true
    reorder_and_upcast_attn: false
    gradient_checkpointing: false
    gradient_checkpointing_ln: false
    gradient_checkpointing_mlp: false
    gradient_checkpointing_start_layers: 0
    use_ft_flash_attn: false
    use_ft_linear: false
    use_ft_layernorm: false
    use_rmpad: true
    pad_output: false
    value_moe_num_expert: 0
    value_moe_qkv_topk: 4
    value_moe_qkv_times: 1
    value_moe_is_repeat: true
    value_moe_expert_type: linear-lego
    value_moe_gate_type: default-lego
    value_moe_gate_metric_type: default
    cont_train_mode: default
    exact_token_as_loss_denominator: false
    fuse_lora_weight: true
    save_mixed_ckpt_in_shards: false
    save_mixed_model_states_freq: final
    skip_n_iters: -1
    hidden_size: 4096
    n_embed: 4096
    n_inner: 14336
    n_head: 32
    n_layer: 32
    vocab_size: 155136
    max_position_embeddings: 32768
    cross_entropy_spilt_num: 1
    layer_norm_epsilon: 1.0e-05
    activation_function: gelu_new
    resid_pdrop: 0.1
    embd_pdrop: 0.0
    attn_pdrop: 0.1
    scale_attn_by_inverse_layer_idx: false
    initializer_range: 0.009882118
    tie_weight: false
    pad_idx: 1
    use_xperf_rotary: false
    fuse_gelu_gemm: false
    position_embeddings_type: rope
    n_shared_qhead: 4
    num_q_heads: -1
    num_kv_heads: -1
    head_dim: -1
    kv_mirror_layers: []
    kv_mirror_imitated_layers: []
    hidden_decoding_layers: []
    hidden_decoding_imitated_layers: []
    residual_post_ln_layers: []
    hyperconnection_rate: -1
    repeat_kv_heads: true
    sparse_attention_window_size:
    - -1
    use_query_swiglu: false
    query_swiglu_inner_dim: 8192
    force_mem_efficient_layers:
    - -1
    noop_transformer_layers: []
    dense_ffn_layers: []
    dense_ffn_type: swiglu
    dense_ffn_inner_dim: -1
    moe_expert_type: exp-xelego
    moe_gate_type: caplog-lego
    moe_gate_metric_type: lego
    moe_expert_exp_level: 4
    moe_expert_exp_first_dim_factor: 1.0
    moe_expert_exp_first_num: 2
    moe_topk: 5
    moe_num_expert: 0
    moe_expert_eq_dim_factor: 0.25
    moe_backend: default
    moe_overlap_recomp_grad_comm: false
    moe_expert_op_version: V1
    moe_aux_loss_weight: 0.001
    moe_gate_dropout: 0.0
    moe_use_balance: false
    moe_expert_group_capacity: 1.0
    moe_expert_group_balance_loss_weight: 0.0
    moe_expert_groups_in_ep_rank: 1
    moe_enable_warmup: false
    moe_swiglu_fc1_2_init_scale: 1.0
    janus_use_big_op: false
    janus_big_op_version: V1
    janus_big_op_attn_grad_accum_fusion: true
    janus_p7_big_op_mlp_fwd_rs_fp8_compression: ''
    janus_p7_big_op_mlp_bwd_ag_fp8_compression: ''
    janus_big_op_offload_enable: false
    convert_gate_to_fp32: false
    moe_enable_ema_update: 1
    query_head_scale_factor: 1
    moe_pr_scale_factor: 1.0
    moe_pr_expert_type: disabled
    lora_rank: 0
    rope_mode: default
    rope_scale: 1
    rope_base: 500000.0
    rope_cut: false
    rope_cut_head_dim: 0
    rope_force_fp32: false
    sparse_attention_window_scale: 1
    sparse_attention_global_window_size:
    - 0
    use_attention_bias: false
    layer_norm_type: npu_rmsnorm
    use_key_layernorm: false
    key_norm_after_rope: false
    use_query_layernorm: false
    use_context_groupnorm: false
    use_mariana_gqa_pattern: false
    use_sequence_parallel_attention: false
    use_sequence_parallel_attention_a2a: false
    context_parallel_use_all_gather: false
    enable_hybrid_data_parallel: false
    cross_entropy_fusion: none
    rope_gen_method: loader
    fp8_use_bf16_layers: ''
    use_lightweight_fp8: false
    deterministic_mode: false
    megatron_tensor_parallel_size: 8
    megatron_pipeline_parallel_size: 1
    megatron_context_parallel_size: 1
    megatron_expert_parallel_size: 1
    megatron_expert_parallel_size_in_dp: 1
    megatron_context_parallel_query_only: false
    megatron_num_layers_per_virtual_pipeline_stage: 0
    megatron_micro_batch_size: 1
    megatron_global_batch_size: 32
    megatron_sequence_parallel: true
    megatron_recompute_granularity: ''
    megatron_use_flash_attention: true
    megatron_recompute_method: uniform
    megatron_recompute_num_layers: 1
    megatron_distribute_saved_activations: false
    megatron_enable_distributed_optimizer: true
    megatron_use_multi_precision_ddp: false
    megatron_sequence_parallel_as_data_parallel_in_optimizer: false
    megatron_param_alignment_in_bytes: 0
    megatron_gather_params_use_alltoall: false
    megatron_enable_initial_jit_warmup: true
    megatron_accumulate_allreduce_grads_in_fp32: true
    megatron_bf16_use_bf16_allreduce_grads: false
    megatron_grad_comm_type: ''
    megatron_reduce_grads_use_alltoall: false
    megatron_scale_loss_in_gradient: false
    megatron_scale_gradient_after_allreduce: false
    megatron_ddp_impl: local
    megatron_bf16_qt: false
    megatron_empty_cache_level: 0
    megatron_force_fp32_embed: false
    megatron_deterministic_flash_attn: false
    megatron_switch_pp_and_dp: false
    megatron_timing_log_level: 2
    megatron_no_load_rng: false
    megatron_no_save_rng: false
    megatron_no_load_optim: false
    megatron_mem_efficient_column_parallel: true
    megatron_masked_softmax_fusion: true
    megatron_bias_gelu_fusion: false
    megatron_bias_dropout_fusion: false
    megatron_gradient_accumulation_fusion: true
    megatron_overlap_p2p_comm: false
    megatron_deallocate_pipeline_outputs: true
    megatron_timing_log_option: local
    megatron_barrier_with_L1_time: false
    megatron_strict_align_diff_with_ds: false
    megatron_parallel_linear_force_weight_contiguous: false
    megatron_use_mariana_softmax: false
    megatron_use_mariana_activation: false
    megatron_overlap_data_parallel_communication: false
    megatron_overlap_dp_grad_comm: false
    megatron_overlap_dp_param_comm: false
    megatron_early_prefetch_dp_allgather: true
    megatron_use_non_sequential_block: false
    megatron_overlap_attn_grad_input_comm: true
    megatron_sequence_data_parallel_size: -1
    megatron_distributed_sequence_parallel_size: -1
    megatron_num_layers_for_pipeline_stages: []
    megatron_vocab_parallel_embedding_fusion: false
    megatron_embedding_reduce_scatter_for_sp: true
    megatron_print_args: true
    megatron_grad_norm_skip: -1.0
    megatron_reorder_wgrad: false
    megatron_lm_logits_reorder_wgrad: false
    megatron_lm_logits_lastn_wgrad: 0
    megatron_offload_activations: false
    megatron_offload_ratio: 1.0
    megatron_offload_launch_ratio: 1.0
    megatron_optimizer_offload_main_param: false
    megatron_optimizer_offload_state: false
    megatron_optimizer_offload_overlap_with_dp: false
    megatron_data_parallel_random_init: false
    megatron_pipeline_strategy: ''
    megatron_pipeline_wgrad_strategy: ''
    megatron_pipeline_warmup_overlap: false
    megatron_pipeline_fuse1f1b: false
    megatron_allow_transformer_engine: false
    megatron_fp8_e4m3: false
    megatron_fp8_hybrid: false
    megatron_fp8_wgrad: true
    megatron_fp8_dgrad: true
    megatron_fp8_margin: 0
    megatron_fp8_interval: 1
    megatron_transformer_impl: local
    megatron_fp8_amax_history_len: 1024
    megatron_fp8_amax_compute_algo: max
    megatron_use_qlora: false
    megatron_qlora_quant_weight_dtype: null
    megatron_qlora_quant_real_store: false
    megatron_qlora_quant_groupsize: -1
    megatron_qlora_quant_input_dtype: ''
    megatron_qlora_quant_aware_lora: false
    megatron_qlora_quant_aware_L4Q: false
    megatron_terapipe_nano_batch_size: -1
  lora_config:
    default:
      lora_dropout: 0.0
      lora_rank: 64
      layers:
      - all
      init_method: normal
      init_mode: nonzero_parallel_init
      init_kwargs: {}
      lora_alpha: 2.0
      use_rslora: true
      lora_experts_appr: full
      use_qlora: false
      qlora_quant_weight_dtype: null
      qlora_quant_real_store: false
      qlora_quant_aware_L4Q: false
      qlora_quant_groupsize: -1
      qlora_quant_input_dtype: null
      qlora_quant_aware_lora: false
      post_training_quant: false
      fully_sharded: false
      emb_trainable: true
    target_modules:
    - query_key_value
    - experts
    - dense
    query_key_value:
      lora_rank: -1
      lora_alpha: -1.0
    experts:
      lora_rank: -1
      lora_alpha: -1.0
    dense:
      lora_rank: -1
      lora_alpha: -1.0
    dense_h_to_4h:
      lora_rank: -1
      lora_alpha: -1.0
    dense_4h_to_h:
      lora_rank: -1
      lora_alpha: -1.0
  freeze_prefix: null
  partial_pretrain: hdfs://haruna/home/byte_data_seed/ssd_hldy/evals_pipeline/checkpoints/20250224/home/byte_data_seed/hdd_hldy/user/sujing.29/seekpath/P61_D6_8B_8M_tp2_stage2_630CT_FIM_LCTX_GPU/checkpoints/global_step_206000/megatron_merge_states.pt
  partial_pretrain_rename: null
  reset_global_step: -1
  override_lr_scheduler: true
  start_debug_server: false
  clip_token_ids: false
data:
  train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/shen-sft/data/sft-lctx-swebench
  val_path: ''
  train_size: 15177351717
  val_size: -1
  train_batch_size: 32
  train_num_workers: 4
  val_batch_size: -1
  val_num_workers: 1
  max_seq_len: 32768
  val_max_seq_len: -1
  text_keys:
  - content_split
  tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
  gpu_prefetch: false
  cpu_prefetch: false
  dyn_bsz: true
  dyn_bsz_margin: 0.0
  stride: -1
  warmup_step_rate: -1.0
  tokenizer_type: bbpe
  bsz_warmup: true
  bsz_warmup_rate: 0.03
  return_source: true
  synthetic_sample: false
  synthetic_batch: false
  seq_lens: null
  seq_probs: null
  enable_sampling_ratios: false
  train_path_with_ratio: null
  src_weights: null
  parse_aug_data: true
  loader_accumulate: -1
  bsz_warmup_warmup_step_rate: 0.0
  max_epochs: 1
  pad_idx: 1
  strategy: megatron
  megatron_micro_batch_size: 1
  use_rmpad: true
  hidden_size: 4096
  megatron_sequence_parallel: true
  max_position_embeddings: 32768
  position_embeddings_type: rope
  use_sequence_parallel_attention: false
  use_sequence_parallel_attention_a2a: false
  resume_ckpt_path: ''
  val_override_est_steps: false
  init_without_cli: true
  rope_mode: default
  rope_scale: 1
  rope_base: 500000.0
  rope_cut: false
  rope_cut_head_dim: 0
  init_val_loader_worker_beforehand: false
  megatron_global_batch_size: 1
  megatron_tensor_parallel_size: 1
  megatron_pipeline_parallel_size: 1
  n_head: 1
log_level: INFO
val_only: false
download_ckpt_in_shards: true
gc_interval: 50
disable_ckpt_verifier: false
profiler_at_iter: -1
timer_at_iter: -1
profile_all_ranks: false
profile_ranks: []
profile_every_n_steps: -1
profiler_memory_at_iter: null
profile_max_preview_rank: 0