---
# NOTE(review): this config arrived with all indentation collapsed onto a few
# physical lines, which is not parseable YAML ("trainer: default_root_dir: null"
# on one line is a nested-mapping syntax error). The block structure below was
# reconstructed from key names and value order. Nesting of a few sub-sections
# (optimizer_kwargs internals, the checkpoint_kwargs tail keys, lora_config
# placement, and the trailing run options) is inferred — confirm against the
# consuming framework's config schema before relying on it.
trainer:
  default_root_dir: null
  default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/shen.zheng1/seekpath/basemodelv7-lctx/sft-lctx-swebench/run2
  logger:
    - tracking
    - console
  log_every_n_steps: 50
  benchmark: false
  enable_speedmonitor: true
  stats_speedmonitor: false
  enable_versions: false
  detect_anomaly: false
  deterministic: false
  accelerator: gpu
  accelerator_kwargs:
    mega_config: null
  precision: bf16
  max_epochs: 1
  max_steps: -1
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  static_sync_limit_val: false
  sync_batchnorm: false
  sync_fit_metrics: null
  val_check_interval:
    - 20000000
  save_before_val: false
  accumulate_grad_batches: null
  gradient_clip_val: 1.0
  max_grad_clip: 0.0
  seed: null
  summarize_model_depth: 0
  resume_ckpt_path: null
  frozen_ckpt_path: null
  resume_strict: true
  resume_optimizer: true
  resume_metadata: true
  resume_loader_state: false
  callbacks: null
  enable_checkpoint:
    - 1
    - 10000
  checkpoint_monitor: step
  checkpoint_mode: max
  dataloader_timeout: -1
  dataloader_retry_limit: 100
  dataloader_retry_persistent_limit: 5
  find_unused_parameters: false
  project_name: seekpath_v2
  experiment_name: P61_D6_8B_npu_8M_tp2_stage2_630CT_FIM
  enable_trace: false
  reload_dataloaders_every_n_epochs: -1
  strategy: megatron
  enable_qat: false
  no_quant_module: []
  enable_ptq: true
  qat_kwargs: {}
  optimizer_kwargs:
    optimizer:
      type: adam
      params:
        lr: 2.0e-05
        betas:
          - 0.9
          - 0.95
        eps: 1.0e-08
        weight_decay: 0.1
        bias_correction: true
        adam_w_mode: true
        momentum: 0.9
      # NOTE(review): the keys below read as optimizer-level options rather
      # than adam params; nesting inferred — verify against the schema.
      lr_mult_keys: []
      no_weight_decay_keys: []
      weight_decay_keys: []
      lr_mult_start_epoch: 0
      lr_mult: 1.0
      force_bfloat16_state: false
    scheduler:
      type: megatron.optimizer_param_schedule.OptimizerParamScheduler
      total_steps_param_name: num_training_steps
      warmup_steps_param_name: num_warmup_steps
      interval: step
      params:
        warmup_step_rate: 0.0
        lr_end: 0.1
        lr_decay_style: cosine
        lr_decay_rate: 1.0
    grad_norm_layers: []
  checkpoint_kwargs:
    verbose: false
    save_last: false
    save_weights_only: false
    every_n_train_steps: -1
    every_n_seconds: -1
    save_best: false
    storage:
      enable_shm_download: false
      enable_shm_upload: false
      download_thread_num: 16
      upload_thread_num: 1
    skip_last_dataloader_ckpt: true
    magnus_ckpt_path: ''
    enable_auto_align_ckpt_path: false
    enable_save_checkpoint_async: true
  enable_profiler: false
  profiler_schedule_kwargs:
    wait: 50
    warmup: 3
    active: 3
    repeat: 1
  profile_all_ranks: false
  enable_bsdp: false
  bsdp_num_prefetch: 64
  keep_frozen_weights: true
  val_reduce_fn: {}
  experiment_id: null
  enable_omnistore: true
  mesh_num_group: -1
  mesh_gpus_per_group: -1
model:
  network:
    scale_attn_weights: true
    reorder_and_upcast_attn: false
    gradient_checkpointing: false
    gradient_checkpointing_ln: false
    gradient_checkpointing_mlp: false
    gradient_checkpointing_start_layers: 0
    use_ft_flash_attn: false
    use_ft_linear: false
    use_ft_layernorm: false
    use_rmpad: true
    pad_output: false
    value_moe_num_expert: 0
    value_moe_qkv_topk: 4
    value_moe_qkv_times: 1
    value_moe_is_repeat: true
    value_moe_expert_type: linear-lego
    value_moe_gate_type: default-lego
    value_moe_gate_metric_type: default
    cont_train_mode: default
    exact_token_as_loss_denominator: false
    fuse_lora_weight: true
    save_mixed_ckpt_in_shards: false
    save_mixed_model_states_freq: final
    skip_n_iters: -1
    hidden_size: 4096
    n_embed: 4096
    n_inner: 14336
    n_head: 32
    n_layer: 32
    vocab_size: 155136
    max_position_embeddings: 32768
    # NOTE: "spilt" spelling presumably matches the consuming code's key name —
    # do not "fix" it here without changing the consumer.
    cross_entropy_spilt_num: 1
    layer_norm_epsilon: 1.0e-05
    activation_function: gelu_new
    resid_pdrop: 0.1
    embd_pdrop: 0.0
    attn_pdrop: 0.1
    scale_attn_by_inverse_layer_idx: false
    initializer_range: 0.009882118
    tie_weight: false
    pad_idx: 1
    use_xperf_rotary: false
    fuse_gelu_gemm: false
    position_embeddings_type: rope
    n_shared_qhead: 4
    num_q_heads: -1
    num_kv_heads: -1
    head_dim: -1
    kv_mirror_layers: []
    kv_mirror_imitated_layers: []
    hidden_decoding_layers: []
    hidden_decoding_imitated_layers: []
    residual_post_ln_layers: []
    hyperconnection_rate: -1
    repeat_kv_heads: true
    sparse_attention_window_size:
      - -1
    use_query_swiglu: false
    query_swiglu_inner_dim: 8192
    force_mem_efficient_layers:
      - -1
    noop_transformer_layers: []
    dense_ffn_layers: []
    dense_ffn_type: swiglu
    dense_ffn_inner_dim: -1
    moe_expert_type: exp-xelego
    moe_gate_type: caplog-lego
    moe_gate_metric_type: lego
    moe_expert_exp_level: 4
    moe_expert_exp_first_dim_factor: 1.0
    moe_expert_exp_first_num: 2
    moe_topk: 5
    moe_num_expert: 0
    moe_expert_eq_dim_factor: 0.25
    moe_backend: default
    moe_overlap_recomp_grad_comm: false
    moe_expert_op_version: V1
    moe_aux_loss_weight: 0.001
    moe_gate_dropout: 0.0
    moe_use_balance: false
    moe_expert_group_capacity: 1.0
    moe_expert_group_balance_loss_weight: 0.0
    moe_expert_groups_in_ep_rank: 1
    moe_enable_warmup: false
    moe_swiglu_fc1_2_init_scale: 1.0
    janus_use_big_op: false
    janus_big_op_version: V1
    janus_big_op_attn_grad_accum_fusion: true
    janus_p7_big_op_mlp_fwd_rs_fp8_compression: ''
    janus_p7_big_op_mlp_bwd_ag_fp8_compression: ''
    janus_big_op_offload_enable: false
    convert_gate_to_fp32: false
    moe_enable_ema_update: 1
    query_head_scale_factor: 1
    moe_pr_scale_factor: 1.0
    moe_pr_expert_type: disabled
    lora_rank: 0
    rope_mode: default
    rope_scale: 1
    rope_base: 500000.0
    rope_cut: false
    rope_cut_head_dim: 0
    rope_force_fp32: false
    sparse_attention_window_scale: 1
    sparse_attention_global_window_size:
      - 0
    use_attention_bias: false
    layer_norm_type: npu_rmsnorm
    use_key_layernorm: false
    key_norm_after_rope: false
    use_query_layernorm: false
    use_context_groupnorm: false
    use_mariana_gqa_pattern: false
    use_sequence_parallel_attention: false
    use_sequence_parallel_attention_a2a: false
    context_parallel_use_all_gather: false
    enable_hybrid_data_parallel: false
    cross_entropy_fusion: none
    rope_gen_method: loader
    fp8_use_bf16_layers: ''
    use_lightweight_fp8: false
    deterministic_mode: false
    megatron_tensor_parallel_size: 8
    megatron_pipeline_parallel_size: 1
    megatron_context_parallel_size: 1
    megatron_expert_parallel_size: 1
    megatron_expert_parallel_size_in_dp: 1
    megatron_context_parallel_query_only: false
    megatron_num_layers_per_virtual_pipeline_stage: 0
    megatron_micro_batch_size: 1
    megatron_global_batch_size: 32
    megatron_sequence_parallel: true
    megatron_recompute_granularity: ''
    megatron_use_flash_attention: true
    megatron_recompute_method: uniform
    megatron_recompute_num_layers: 1
    megatron_distribute_saved_activations: false
    megatron_enable_distributed_optimizer: true
    megatron_use_multi_precision_ddp: false
    megatron_sequence_parallel_as_data_parallel_in_optimizer: false
    megatron_param_alignment_in_bytes: 0
    megatron_gather_params_use_alltoall: false
    megatron_enable_initial_jit_warmup: true
    megatron_accumulate_allreduce_grads_in_fp32: true
    megatron_bf16_use_bf16_allreduce_grads: false
    megatron_grad_comm_type: ''
    megatron_reduce_grads_use_alltoall: false
    megatron_scale_loss_in_gradient: false
    megatron_scale_gradient_after_allreduce: false
    megatron_ddp_impl: local
    megatron_bf16_qt: false
    megatron_empty_cache_level: 0
    megatron_force_fp32_embed: false
    megatron_deterministic_flash_attn: false
    megatron_switch_pp_and_dp: false
    megatron_timing_log_level: 2
    megatron_no_load_rng: false
    megatron_no_save_rng: false
    megatron_no_load_optim: false
    megatron_mem_efficient_column_parallel: true
    megatron_masked_softmax_fusion: true
    megatron_bias_gelu_fusion: false
    megatron_bias_dropout_fusion: false
    megatron_gradient_accumulation_fusion: true
    megatron_overlap_p2p_comm: false
    megatron_deallocate_pipeline_outputs: true
    megatron_timing_log_option: local
    megatron_barrier_with_L1_time: false
    megatron_strict_align_diff_with_ds: false
    megatron_parallel_linear_force_weight_contiguous: false
    megatron_use_mariana_softmax: false
    megatron_use_mariana_activation: false
    megatron_overlap_data_parallel_communication: false
    megatron_overlap_dp_grad_comm: false
    megatron_overlap_dp_param_comm: false
    megatron_early_prefetch_dp_allgather: true
    megatron_use_non_sequential_block: false
    megatron_overlap_attn_grad_input_comm: true
    megatron_sequence_data_parallel_size: -1
    megatron_distributed_sequence_parallel_size: -1
    megatron_num_layers_for_pipeline_stages: []
    megatron_vocab_parallel_embedding_fusion: false
    megatron_embedding_reduce_scatter_for_sp: true
    megatron_print_args: true
    megatron_grad_norm_skip: -1.0
    megatron_reorder_wgrad: false
    megatron_lm_logits_reorder_wgrad: false
    megatron_lm_logits_lastn_wgrad: 0
    megatron_offload_activations: false
    megatron_offload_ratio: 1.0
    megatron_offload_launch_ratio: 1.0
    megatron_optimizer_offload_main_param: false
    megatron_optimizer_offload_state: false
    megatron_optimizer_offload_overlap_with_dp: false
    megatron_data_parallel_random_init: false
    megatron_pipeline_strategy: ''
    megatron_pipeline_wgrad_strategy: ''
    megatron_pipeline_warmup_overlap: false
    megatron_pipeline_fuse1f1b: false
    megatron_allow_transformer_engine: false
    megatron_fp8_e4m3: false
    megatron_fp8_hybrid: false
    megatron_fp8_wgrad: true
    megatron_fp8_dgrad: true
    megatron_fp8_margin: 0
    megatron_fp8_interval: 1
    megatron_transformer_impl: local
    megatron_fp8_amax_history_len: 1024
    megatron_fp8_amax_compute_algo: max
    megatron_use_qlora: false
    megatron_qlora_quant_weight_dtype: null
    megatron_qlora_quant_real_store: false
    megatron_qlora_quant_groupsize: -1
    megatron_qlora_quant_input_dtype: ''
    megatron_qlora_quant_aware_lora: false
    megatron_qlora_quant_aware_L4Q: false
    megatron_terapipe_nano_batch_size: -1
    # NOTE(review): lora_config placed under network based on key order;
    # per-module entries override the "default" group — confirm placement.
    lora_config:
      default:
        lora_dropout: 0.0
        lora_rank: 64
        layers:
          - all
        init_method: normal
        init_mode: nonzero_parallel_init
        init_kwargs: {}
        lora_alpha: 2.0
        use_rslora: true
        lora_experts_appr: full
        use_qlora: false
        qlora_quant_weight_dtype: null
        qlora_quant_real_store: false
        qlora_quant_aware_L4Q: false
        qlora_quant_groupsize: -1
        # Plain "None" parses as the STRING "None", not YAML null — quoted to
        # make that explicit. Inconsistent with
        # megatron_qlora_quant_input_dtype: '' above; verify which the
        # consumer expects.
        qlora_quant_input_dtype: 'None'
        qlora_quant_aware_lora: false
        post_training_quant: false
        fully_sharded: false
        emb_trainable: true
        target_modules:
          - query_key_value
          - experts
          - dense
      query_key_value:
        lora_rank: -1
        lora_alpha: -1.0
      experts:
        lora_rank: -1
        lora_alpha: -1.0
      dense:
        lora_rank: -1
        lora_alpha: -1.0
      dense_h_to_4h:
        lora_rank: -1
        lora_alpha: -1.0
      dense_4h_to_h:
        lora_rank: -1
        lora_alpha: -1.0
  freeze_prefix: null
  partial_pretrain: hdfs://haruna/home/byte_data_seed/ssd_hldy/evals_pipeline/checkpoints/20250224/home/byte_data_seed/hdd_hldy/user/sujing.29/seekpath/P61_D6_8B_8M_tp2_stage2_630CT_FIM_LCTX_GPU/checkpoints/global_step_206000/megatron_merge_states.pt
  partial_pretrain_rename: null
  reset_global_step: -1
  override_lr_scheduler: true
  start_debug_server: false
  clip_token_ids: false
data:
  train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/shen-sft/data/sft-lctx-swebench
  val_path: ''
  train_size: 15177351717
  val_size: -1
  train_batch_size: 32
  train_num_workers: 4
  val_batch_size: -1
  val_num_workers: 1
  max_seq_len: 32768
  val_max_seq_len: -1
  text_keys:
    - content_split
  tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret
  gpu_prefetch: false
  cpu_prefetch: false
  dyn_bsz: true
  dyn_bsz_margin: 0.0
  stride: -1
  warmup_step_rate: -1.0
  tokenizer_type: bbpe
  bsz_warmup: true
  bsz_warmup_rate: 0.03
  return_source: true
  synthetic_sample: false
  synthetic_batch: false
  seq_lens: null
  seq_probs: null
  enable_sampling_ratios: false
  train_path_with_ratio: null
  src_weights: null
  parse_aug_data: true
  loader_accumulate: -1
  bsz_warmup_warmup_step_rate: 0.0
  max_epochs: 1
  pad_idx: 1
  strategy: megatron
  megatron_micro_batch_size: 1
  use_rmpad: true
  hidden_size: 4096
  megatron_sequence_parallel: true
  max_position_embeddings: 32768
  position_embeddings_type: rope
  use_sequence_parallel_attention: false
  use_sequence_parallel_attention_a2a: false
  resume_ckpt_path: ''
  val_override_est_steps: false
  init_without_cli: true
  rope_mode: default
  rope_scale: 1
  rope_base: 500000.0
  rope_cut: false
  rope_cut_head_dim: 0
  init_val_loader_worker_beforehand: false
  megatron_global_batch_size: 1
  megatron_tensor_parallel_size: 1
  megatron_pipeline_parallel_size: 1
  n_head: 1
# NOTE(review): the remaining keys look like top-level run options (siblings
# of trainer/model/data) rather than data-loader options — confirm nesting.
log_level: INFO
val_only: false
download_ckpt_in_shards: true
gc_interval: 50
disable_ckpt_verifier: false
profiler_at_iter: -1
timer_at_iter: -1
profile_all_ranks: false
profile_ranks: []
profile_every_n_steps: -1
profiler_memory_at_iter: null
profile_max_preview_rank: 0