| trainer: |
| default_root_dir: null |
| default_hdfs_dir: hdfs://haruna/home/byte_data_seed/hdd_hldy/user/yuyu.zhang/seekpath/P61_D6_8B_8M_tp2_stage2_H800_code |
| logger: |
| - tracking |
| - console |
| log_every_n_steps: 50 |
| benchmark: false |
| enable_speedmonitor: true |
| stats_speedmonitor: false |
| enable_versions: false |
| detect_anomaly: false |
| deterministic: false |
| accelerator: gpu |
| accelerator_kwargs: |
| mega_config: null |
| precision: bf16 |
| max_epochs: 1 |
| max_steps: -1 |
| limit_train_batches: null |
| limit_val_batches: null |
| limit_test_batches: null |
| static_sync_limit_val: false |
| sync_batchnorm: false |
| sync_fit_metrics: null |
| val_check_interval: |
| - 20000000 |
| save_before_val: false |
| accumulate_grad_batches: null |
| gradient_clip_val: 1.0 |
| max_grad_clip: 0.0 |
| seed: null |
| summarize_model_depth: 0 |
| resume_ckpt_path: auto |
| frozen_ckpt_path: null |
| resume_strict: true |
| resume_optimizer: true |
| resume_metadata: true |
| resume_loader_state: false |
| callbacks: null |
| enable_checkpoint: |
| - 1 |
| - 10000 |
| checkpoint_monitor: step |
| checkpoint_mode: max |
| dataloader_timeout: -1 |
| dataloader_retry_limit: 100 |
| dataloader_retry_persistent_limit: 5 |
| find_unused_parameters: false |
| project_name: seekpath_v3 |
| experiment_name: P61_D6_8B_npu_8M_tp2_H800stage1_code |
| enable_trace: false |
| reload_dataloaders_every_n_epochs: -1 |
| strategy: megatron |
| enable_qat: false |
| no_quant_module: [] |
| enable_ptq: true |
| qat_kwargs: {} |
| optimizer_kwargs: |
| optimizer: |
| type: adam |
| params: |
| lr: 3.0e-05 |
| betas: |
| - 0.9 |
| - 0.95 |
| eps: 1.0e-08 |
| weight_decay: 0.1 |
| bias_correction: true |
| adam_w_mode: true |
| momentum: 0.9 |
| lr_mult_keys: [] |
| no_weight_decay_keys: [] |
| weight_decay_keys: [] |
| lr_mult_start_epoch: 0 |
| lr_mult: 1.0 |
| scheduler: |
| type: megatron.optimizer_param_schedule.OptimizerParamScheduler |
| total_steps_param_name: num_training_steps |
| warmup_steps_param_name: num_warmup_steps |
| interval: step |
| params: |
| warmup_step_rate: 0.002 |
| lr_end: 0.1 |
| lr_decay_style: constant |
| lr_decay_rate: 1.0 |
| grad_norm_layers: [] |
| checkpoint_kwargs: |
| verbose: false |
| save_last: false |
| save_weights_only: false |
| every_n_train_steps: -1 |
| every_n_seconds: -1 |
| save_best: false |
| storage: |
| enable_shm_download: false |
| enable_shm_upload: false |
| download_thread_num: 16 |
| upload_thread_num: 1 |
| enable_save_checkpoint_async: true |
| enable_profiler: false |
| profiler_schedule_kwargs: |
| wait: 50 |
| warmup: 3 |
| active: 3 |
| repeat: 1 |
| profile_all_ranks: false |
| enable_bsdp: false |
| bsdp_num_prefetch: 64 |
| keep_frozen_weights: true |
| val_reduce_fn: {} |
| experiment_id: null |
| enable_omnistore: false |
| model: |
| network: |
| hidden_size: 4096 |
| n_embed: 4096 |
| n_inner: 14336 |
| n_head: 32 |
| n_layer: 32 |
| vocab_size: 155136 |
| max_position_embeddings: 32768 |
| cross_entropy_spilt_num: 1 |
| layer_norm_epsilon: 1.0e-05 |
| activation_function: gelu_new |
| resid_pdrop: 0.1 |
| embd_pdrop: 0.0 |
| attn_pdrop: 0.1 |
| scale_attn_weights: true |
| scale_attn_by_inverse_layer_idx: false |
| reorder_and_upcast_attn: false |
| initializer_range: 0.009882118 |
| gradient_checkpointing: false |
| gradient_checkpointing_ln: false |
| gradient_checkpointing_mlp: false |
| gradient_checkpointing_start_layers: 0 |
| tie_weight: false |
| pad_idx: 1 |
| use_ft_flash_attn: false |
| use_ft_linear: false |
| use_ft_layernorm: false |
| use_xperf_rotary: false |
| use_rmpad: true |
| fuse_gelu_gemm: false |
| pad_output: false |
| position_embeddings_type: rope |
| skip_n_iters: -1 |
| n_shared_qhead: 4 |
| num_q_heads: -1 |
| num_kv_heads: -1 |
| head_dim: -1 |
| kv_mirror_layers: [] |
| kv_mirror_imitated_layers: [] |
| residual_post_ln_layers: [] |
| hyperconnection_rate: -1 |
| repeat_kv_heads: true |
| sparse_attention_window_size: |
| - -1 |
| use_query_swiglu: false |
| query_swiglu_inner_dim: 8192 |
| force_mem_efficient_layers: |
| - -1 |
| noop_transformer_layers: [] |
| dense_ffn_layers: [] |
| dense_ffn_type: swiglu |
| dense_ffn_inner_dim: -1 |
| moe_expert_type: exp-xelego |
| moe_gate_type: caplog-lego |
| moe_gate_metric_type: lego |
| moe_expert_exp_level: 4 |
| moe_expert_exp_first_dim_factor: 1.0 |
| moe_expert_exp_first_num: 2 |
| moe_topk: 5 |
| moe_num_expert: 0 |
| moe_expert_eq_dim_factor: 0.25 |
| moe_backend: default |
| moe_overlap_recomp_grad_comm: false |
| moe_expert_op_version: V1 |
| moe_aux_loss_weight: 0.001 |
| moe_gate_dropout: 0.0 |
| moe_use_balance: false |
| moe_expert_group_capacity: 1.0 |
| moe_expert_group_balance_loss_weight: 0.0 |
| moe_expert_groups_in_ep_rank: 1 |
| moe_enable_warmup: false |
| moe_swiglu_fc1_2_init_scale: 1.0 |
| janus_use_big_op: false |
| janus_big_op_version: V1 |
| janus_big_op_attn_grad_accum_fusion: true |
| convert_gate_to_fp32: false |
| moe_enable_ema_update: 1 |
| query_head_scale_factor: 1 |
| value_moe_num_expert: 0 |
| value_moe_qkv_topk: 4 |
| value_moe_qkv_times: 1 |
| value_moe_is_repeat: true |
| value_moe_expert_type: linear-lego |
| moe_pr_scale_factor: 1.0 |
| moe_pr_expert_type: disabled |
| value_moe_gate_type: default-lego |
| value_moe_gate_metric_type: default |
| lora_rank: 0 |
| save_mixed_ckpt_in_shards: false |
| save_mixed_model_states_freq: final |
| cont_train_mode: default |
| fuse_lora_weight: true |
| rope_mode: default |
| rope_scale: 1 |
| rope_base: 500000.0 |
| rope_cut: false |
| rope_cut_head_dim: 0 |
| rope_force_fp32: false |
| sparse_attention_window_scale: 1 |
| sparse_attention_global_window_size: |
| - 0 |
| use_attention_bias: false |
| layer_norm_type: rmsnorm_torch |
| exact_token_as_loss_denominator: false |
| use_key_layernorm: false |
| key_norm_after_rope: false |
| use_query_layernorm: false |
| use_context_groupnorm: false |
| use_mariana_gqa_pattern: false |
| use_sequence_parallel_attention: false |
| use_sequence_parallel_attention_a2a: false |
| context_parallel_use_all_gather: false |
| fp8_use_bf16_layers: '' |
| deterministic_mode: false |
| megatron_tensor_parallel_size: 8 |
| megatron_pipeline_parallel_size: 1 |
| megatron_context_parallel_size: 1 |
| megatron_expert_parallel_size: 1 |
| megatron_expert_parallel_size_in_dp: 1 |
| megatron_context_parallel_query_only: false |
| megatron_num_layers_per_virtual_pipeline_stage: 0 |
| megatron_micro_batch_size: 1 |
| megatron_global_batch_size: 256 |
| megatron_sequence_parallel: true |
| megatron_recompute_granularity: '' |
| megatron_use_flash_attention: true |
| megatron_recompute_method: uniform |
| megatron_recompute_num_layers: 1 |
| megatron_distribute_saved_activations: false |
| megatron_enable_distributed_optimizer: true |
| megatron_use_multi_precision_ddp: false |
| megatron_sequence_parallel_as_data_parallel_in_optimizer: false |
| megatron_gather_params_use_alltoall: false |
| megatron_enable_initial_jit_warmup: true |
| megatron_accumulate_allreduce_grads_in_fp32: true |
| megatron_bf16_use_bf16_allreduce_grads: false |
| megatron_grad_comm_type: '' |
| megatron_reduce_grads_use_alltoall: false |
| megatron_scale_loss_in_gradient: false |
| megatron_scale_gradient_after_allreduce: false |
| megatron_ddp_impl: local |
| megatron_bf16_qt: false |
| megatron_empty_cache_level: 0 |
| megatron_force_fp32_embed: false |
| megatron_deterministic_flash_attn: false |
| megatron_switch_pp_and_dp: false |
| megatron_timing_log_level: 2 |
| megatron_no_load_rng: false |
| megatron_no_save_rng: false |
| megatron_no_load_optim: false |
| megatron_mem_efficient_column_parallel: true |
| megatron_masked_softmax_fusion: true |
| megatron_bias_gelu_fusion: false |
| megatron_bias_dropout_fusion: false |
| megatron_gradient_accumulation_fusion: true |
| megatron_overlap_p2p_comm: false |
| megatron_deallocate_pipeline_outputs: true |
| megatron_timing_log_option: local |
| megatron_barrier_with_L1_time: false |
| megatron_strict_align_diff_with_ds: false |
| megatron_parallel_linear_force_weight_contiguous: false |
| megatron_use_mariana_softmax: false |
| megatron_use_mariana_activation: false |
| megatron_overlap_data_parallel_communication: false |
| megatron_overlap_dp_grad_comm: false |
| megatron_overlap_dp_param_comm: false |
| megatron_early_prefetch_dp_allgather: true |
| megatron_use_non_sequential_block: false |
| megatron_overlap_attn_grad_input_comm: true |
| megatron_sequence_data_parallel_size: -1 |
| megatron_distributed_sequence_parallel_size: -1 |
| megatron_num_layers_for_pipeline_stages: [] |
| megatron_vocab_parallel_embedding_fusion: false |
| megatron_embedding_reduce_scatter_for_sp: true |
| megatron_print_args: true |
| megatron_grad_norm_skip: -1.0 |
| megatron_reorder_wgrad: false |
| megatron_offload_activations: false |
| megatron_offload_ratio: 1.0 |
| megatron_offload_launch_ratio: 1.0 |
| megatron_optimizer_offload_main_param: false |
| megatron_data_parallel_random_init: false |
| megatron_pipeline_strategy: '' |
| megatron_pipeline_wgrad_strategy: '' |
| megatron_pipeline_warmup_overlap: false |
| megatron_allow_transformer_engine: false |
| megatron_fp8_e4m3: false |
| megatron_fp8_hybrid: false |
| megatron_fp8_wgrad: true |
| megatron_fp8_dgrad: true |
| megatron_fp8_margin: 0 |
| megatron_fp8_interval: 1 |
| megatron_transformer_impl: local |
| megatron_fp8_amax_history_len: 1024 |
| megatron_fp8_amax_compute_algo: max |
| megatron_use_qlora: false |
| megatron_qlora_quant_weight_dtype: null |
| megatron_qlora_quant_real_store: false |
| megatron_qlora_quant_groupsize: -1 |
| megatron_qlora_quant_input_dtype: '' |
| megatron_qlora_quant_aware_lora: false |
| megatron_qlora_quant_aware_L4Q: false |
| megatron_terapipe_nano_batch_size: -1 |
| lora_config: |
| default: |
| lora_dropout: 0.0 |
| lora_rank: 64 |
| layers: |
| - all |
| init_method: normal |
| init_mode: nonzero_parallel_init |
| init_kwargs: {} |
| lora_alpha: 2.0 |
| use_rslora: true |
| lora_experts_appr: full |
| use_qlora: false |
| qlora_quant_weight_dtype: null |
| qlora_quant_real_store: false |
| qlora_quant_aware_L4Q: false |
| qlora_quant_groupsize: -1 |
    qlora_quant_input_dtype: null
| qlora_quant_aware_lora: false |
| post_training_quant: false |
| fully_sharded: false |
| emb_trainable: true |
| target_modules: |
| - query_key_value |
| - experts |
| - dense |
| query_key_value: |
| lora_rank: -1 |
| lora_alpha: -1.0 |
| experts: |
| lora_rank: -1 |
| lora_alpha: -1.0 |
| dense: |
| lora_rank: -1 |
| lora_alpha: -1.0 |
| dense_h_to_4h: |
| lora_rank: -1 |
| lora_alpha: -1.0 |
| dense_4h_to_h: |
| lora_rank: -1 |
| lora_alpha: -1.0 |
| freeze_prefix: null |
| partial_pretrain: null |
| partial_pretrain_rename: null |
| reset_global_step: -1 |
| override_lr_scheduler: true |
| start_debug_server: false |
| clip_token_ids: false |
| data: |
| train_path: hdfs://haruna/home/byte_data_seed/hdd_hldy/seed_code_seekpath/pretrained_yaml_new/V1_longct_datacard_hdfs_new_stage2_code_ct_fim_2.yaml |
| val_path: |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D73_val_20240507_2_200M_token_plain_source_v2_1_part |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/human_all_lite |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/autoeval_code_val_lite |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_20240412_ceval_1_part |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/merged_few_benchmark_datasets_20240705_1_part |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/pretrain_auto_eval_merged_all_v0.3_1_part |
| - hdfs://haruna/home/byte_data_seed/ssd_hldy/p0/D73_release/val/D74_val_20240621_200M_token_tok643_sa8192_plain_source_v2_1_part_dir |
| train_size: 5000000000000 |
| val_size: -1 |
| train_batch_size: 32 |
| train_num_workers: 4 |
| val_batch_size: -1 |
| val_num_workers: 1 |
| max_seq_len: 32768 |
| val_max_seq_len: -1 |
| text_keys: |
| - content_split |
| tokenizer: hdfs://haruna/home/byte_data_seed/hl_lq/seed_code/liuyongfei/tokenizers/bbpe155k-v6.4.3-ml.pret |
| gpu_prefetch: false |
| cpu_prefetch: false |
| dyn_bsz: true |
| dyn_bsz_margin: 0.0 |
| stride: -1 |
| warmup_step_rate: -1.0 |
| tokenizer_type: bbpe |
| bsz_warmup: false |
| bsz_warmup_rate: 0.016 |
| return_source: true |
| synthetic_sample: false |
| synthetic_batch: false |
| seq_lens: null |
| seq_probs: null |
| enable_sampling_ratios: false |
| train_path_with_ratio: null |
| src_weights: null |
| parse_aug_data: false |
| loader_accumulate: -1 |
| bsz_warmup_warmup_step_rate: 0.002 |
| max_epochs: 1 |
| pad_idx: 1 |
| strategy: megatron |
| megatron_micro_batch_size: 1 |
| use_rmpad: true |
| hidden_size: -1 |
| megatron_sequence_parallel: false |
| max_position_embeddings: 2048 |
| position_embeddings_type: absolute |
| use_sequence_parallel_attention: false |
| use_sequence_parallel_attention_a2a: false |
| resume_ckpt_path: '' |
| val_override_est_steps: false |
| init_without_cli: true |
| rope_mode: default |
| rope_scale: 1 |
| rope_base: 500000.0 |
| rope_cut: false |
| rope_cut_head_dim: 0 |
| init_val_loader_worker_beforehand: false |
| megatron_global_batch_size: 1 |
| megatron_tensor_parallel_size: 1 |
| megatron_pipeline_parallel_size: 1 |
| n_head: 1 |
| log_level: INFO |
| val_only: false |
| merge_model_states: false |
| merge_ckpt_dtype: bf16 |
| merge_cache_dir: ./ |
| download_ckpt_in_shards: true |
| gc_interval: 50 |
| profiler_at_iter: -1 |
| timer_at_iter: -1 |
| profile_all_ranks: false |
| profile_ranks: [] |
| profile_every_n_steps: -1 |
| profiler_memory_at_iter: null |
| profile_max_preview_rank: 0 |
|
|