{
  "tensor_model_parallel_size": 8,
  "pipeline_model_parallel_comm_backend": null,
  "pipeline_model_parallel_size": 1,
  "virtual_pipeline_model_parallel_size": null,
  "sequence_parallel": true,
  "context_parallel_size": 2,
  "hierarchical_context_parallel_sizes": null,
  "expert_model_parallel_size": 1,
  "expert_tensor_parallel_size": 8,
  "moe_extended_tp": false,
  "perform_initialization": true,
  "use_cpu_initialization": false,
  "fp16": false,
  "bf16": true,
  "params_dtype": "torch.bfloat16",
  "timers": null,
  "finalize_model_grads_func": null,
  "grad_scale_func": null,
  "no_sync_func": null,
  "grad_sync_func": null,
  "param_sync_func": null,
  "deterministic_mode": false,
  "enable_autocast": false,
  "autocast_dtype": "torch.bfloat16",
  "num_microbatches_with_partial_activation_checkpoints": null,
  "gradient_accumulation_fusion": true,
  "async_tensor_model_parallel_allreduce": false,
  "use_te_rng_tracker": false,
  "tp_comm_overlap": false,
  "tp_comm_bulk_wgrad": true,
  "tp_comm_bulk_dgrad": true,
  "tp_comm_overlap_ag": true,
  "tp_comm_overlap_rs": true,
  "tp_comm_overlap_rs_dgrad": false,
  "tp_comm_split_ag": true,
  "tp_comm_atomic_ag": false,
  "tp_comm_split_rs": true,
  "tp_comm_atomic_rs": false,
  "cross_entropy_loss_fusion": true,
  "cross_entropy_fusion_impl": "native",
  "tp_comm_overlap_disable_qkv": false,
  "tp_comm_overlap_disable_fc1": false,
  "tp_comm_bootstrap_backend": "nccl",
  "overlap_moe_expert_parallel_comm": false,
  "delay_wgrad_compute": false,
  "pipeline_dtype": null,
  "variable_seq_lengths": true,
  "overlap_p2p_comm": false,
  "batch_p2p_comm": true,
  "batch_p2p_sync": true,
  "use_ring_exchange_p2p": false,
  "deallocate_pipeline_outputs": true,
  "defer_embedding_wgrad_compute": false,
  "wgrad_deferral_limit": 0,
  "overlap_p2p_comm_warmup_flush": false,
  "microbatch_group_size_per_vp_stage": 1,
  "cpu_offloading": false,
  "cpu_offloading_num_layers": 0,
  "_cpu_offloading_context": null,
  "cpu_offloading_activations": true,
  "cpu_offloading_weights": false,
  "cpu_offloading_double_buffering": false,
  "barrier_with_L1_time": true,
  "num_layers": 64,
  "mtp_num_layers": null,
  "mtp_loss_scaling_factor": null,
  "num_layers_in_first_pipeline_stage": null,
  "num_layers_in_last_pipeline_stage": null,
  "pipeline_model_parallel_layout": null,
  "account_for_embedding_in_pipeline_split": false,
  "account_for_loss_in_pipeline_split": false,
  "hidden_size": 5120,
  "num_attention_heads": 64,
  "attention_backend": "AttnBackend.flash",
  "softmax_scale": null,
  "softmax_type": "vanilla",
  "num_query_groups": 8,
  "ffn_hidden_size": 25600,
  "kv_channels": 128,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,
  "fp32_residual_connection": false,
  "apply_residual_connection_post_layernorm": false,
  "layernorm_epsilon": 1e-06,
  "layernorm_zero_centered_gamma": false,
  "add_bias_linear": false,
  "add_qkv_bias": false,
  "gated_linear_unit": true,
  "activation_func_fp8_input_store": false,
  "glu_linear_offset": 0.0,
  "activation_func_clamp_value": null,
  "num_moe_experts": null,
  "rotary_interleaved": false,
  "window_size": null,
  "window_attn_skip_freq": null,
  "normalization": "RMSNorm",
  "qk_layernorm": true,
  "test_mode": false,
  "calculate_per_token_loss": false,
  "multi_latent_attention": false,
  "no_rope_freq": null,
  "moe_deepep_num_sms": 20,
  "init_method_std": 0.02,
  "embedding_init_method_std": 0.02,
  "init_model_with_meta_device": false,
  "apply_query_key_layer_scaling": false,
  "attention_softmax_in_fp32": false,
  "disable_bf16_reduced_precision_matmul": false,
  "bias_activation_fusion": false,
  "masked_softmax_fusion": true,
  "persist_layer_norm": false,
  "memory_efficient_layer_norm": false,
  "bias_dropout_fusion": false,
  "apply_rope_fusion": true,
  "use_fused_weighted_squared_relu": false,
  "fused_single_qkv_rope": false,
  "recompute_granularity": null,
  "recompute_method": null,
  "recompute_num_layers": null,
  "distribute_saved_activations": null,
  "recompute_modules": [
    "core_attn"
  ],
  "fp8": null,
  "fp8_recipe": "delayed",
  "fp8_param": false,
  "fp8_margin": 0,
  "fp8_interval": 1,
  "fp8_amax_history_len": 1,
  "fp8_amax_compute_algo": "most_recent",
  "fp8_wgrad": true,
  "fp8_dot_product_attention": false,
  "fp8_multi_head_attention": false,
  "tp_only_amax_red": false,
  "first_last_layers_bf16": false,
  "num_layers_at_start_in_bf16": 1,
  "num_layers_at_end_in_bf16": 1,
  "use_kitchen": false,
  "fp4": null,
  "fp4_recipe": "nvfp4",
  "fp4_param": false,
  "moe_shared_expert_intermediate_size": null,
  "moe_shared_expert_overlap": false,
  "moe_layer_freq": 1,
  "moe_ffn_hidden_size": null,
  "moe_router_load_balancing_type": "none",
  "moe_router_topk": 2,
  "moe_router_topk_limited_devices": null,
  "moe_router_padding_for_fp8": false,
  "moe_router_num_groups": null,
  "moe_router_group_topk": null,
  "moe_router_pre_softmax": false,
  "moe_router_topk_scaling_factor": null,
  "moe_router_score_function": "softmax",
  "moe_router_dtype": null,
  "moe_router_enable_expert_bias": false,
  "moe_router_bias_update_rate": 0.001,
  "moe_router_force_load_balancing": false,
  "moe_grouped_gemm": false,
  "moe_use_legacy_grouped_gemm": false,
  "moe_aux_loss_coeff": 0.0,
  "moe_z_loss_coeff": null,
  "moe_input_jitter_eps": null,
  "moe_token_dropping": false,
  "moe_token_dispatcher_type": "alltoall",
  "moe_enable_deepep": false,
  "moe_per_layer_logging": false,
  "moe_expert_capacity_factor": null,
  "moe_pad_expert_input_to_capacity": false,
  "moe_token_drop_policy": "probs",
  "moe_layer_recompute": false,
  "moe_permute_fusion": false,
  "moe_router_fusion": false,
  "moe_apply_probs_on_input": false,
  "cp_comm_type": null,
  "enable_cuda_graph": false,
  "cuda_graph_use_single_mempool": false,
  "cuda_graph_retain_backward_graph": false,
  "cuda_graph_warmup_steps": 3,
  "external_cuda_graph": false,
  "cuda_graph_impl": "none",
  "cuda_graph_scope": "full",
  "clone_scatter_output_in_embedding": true,
  "disable_parameter_transpose_cache": false,
  "config_logger_dir": "",
  "flash_decode": false,
  "use_te_activation_func": false,
  "inference_rng_tracker": false,
  "inference_sampling_seed": 42,
  "symmetric_ar_type": null,
  "mrope_section": null,
  "is_hybrid_model": false,
  "mamba_state_dim": 128,
  "mamba_head_dim": 64,
  "mamba_num_groups": 8,
  "mamba_num_heads": null,
  "use_mamba_mem_eff_path": true,
  "mlp_chunks_for_prefill": 1,
  "heterogeneous_block_specs": false,
  "hetereogenous_dist_checkpoint": false,
  "quant_recipe": null,
  "transformer_impl": "transformer_engine",
  "fp16_lm_cross_entropy": false,
  "parallel_output": true,
  "share_embeddings_and_output_weights": false,
  "make_vocab_size_divisible_by": 128,
  "position_embedding_type": "rope",
  "rotary_base": 1000000,
  "rotary_percent": 1.0,
  "seq_len_interpolation_factor": null,
  "seq_length": 40960,
  "scatter_embedding_sequence_parallel": true,
  "tp_comm_overlap_cfg": null,
  "use_transformer_engine_full_layer_spec": false,
  "use_transformer_engine_op_fuser": false,
  "hf_model_id": "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface",
  "generation_config": null,
  "vocab_size": 151936,
  "should_pad_vocab": false,
  "mtp_enabled": false,
  "max_position_embeddings": 40960
}