{
  "tensor_model_parallel_size": 8,
  "pipeline_model_parallel_comm_backend": null,
  "pipeline_model_parallel_size": 1,
  "virtual_pipeline_model_parallel_size": null,
  "sequence_parallel": true,
  "context_parallel_size": 2,
  "hierarchical_context_parallel_sizes": null,
  "expert_model_parallel_size": 1,
  "expert_tensor_parallel_size": 8,
  "moe_extended_tp": false,
  "perform_initialization": true,
  "use_cpu_initialization": false,
  "fp16": false,
  "bf16": true,
  "params_dtype": "torch.bfloat16",
  "timers": null,
  "finalize_model_grads_func": null,
  "grad_scale_func": null,
  "no_sync_func": null,
  "grad_sync_func": null,
  "param_sync_func": null,
  "deterministic_mode": false,
  "enable_autocast": false,
  "autocast_dtype": "torch.bfloat16",
  "num_microbatches_with_partial_activation_checkpoints": null,
  "gradient_accumulation_fusion": true,
  "async_tensor_model_parallel_allreduce": false,
  "use_te_rng_tracker": false,
  "tp_comm_overlap": false,
  "tp_comm_bulk_wgrad": true,
  "tp_comm_bulk_dgrad": true,
  "tp_comm_overlap_ag": true,
  "tp_comm_overlap_rs": true,
  "tp_comm_overlap_rs_dgrad": false,
  "tp_comm_split_ag": true,
  "tp_comm_atomic_ag": false,
  "tp_comm_split_rs": true,
  "tp_comm_atomic_rs": false,
  "cross_entropy_loss_fusion": true,
  "cross_entropy_fusion_impl": "native",
  "tp_comm_overlap_disable_qkv": false,
  "tp_comm_overlap_disable_fc1": false,
  "tp_comm_bootstrap_backend": "nccl",
  "overlap_moe_expert_parallel_comm": false,
  "delay_wgrad_compute": false,
  "pipeline_dtype": null,
  "variable_seq_lengths": true,
  "overlap_p2p_comm": false,
  "batch_p2p_comm": true,
  "batch_p2p_sync": true,
  "use_ring_exchange_p2p": false,
  "deallocate_pipeline_outputs": true,
  "defer_embedding_wgrad_compute": false,
  "wgrad_deferral_limit": 0,
  "overlap_p2p_comm_warmup_flush": false,
  "microbatch_group_size_per_vp_stage": 1,
  "cpu_offloading": false,
  "cpu_offloading_num_layers": 0,
  "_cpu_offloading_context": null,
  "cpu_offloading_activations": true,
  "cpu_offloading_weights": false,
  "cpu_offloading_double_buffering": false,
  "barrier_with_L1_time": true,
  "num_layers": 64,
  "mtp_num_layers": null,
  "mtp_loss_scaling_factor": null,
  "num_layers_in_first_pipeline_stage": null,
  "num_layers_in_last_pipeline_stage": null,
  "pipeline_model_parallel_layout": null,
  "account_for_embedding_in_pipeline_split": false,
  "account_for_loss_in_pipeline_split": false,
  "hidden_size": 5120,
  "num_attention_heads": 64,
  "attention_backend": "AttnBackend.flash",
  "softmax_scale": null,
  "softmax_type": "vanilla",
  "num_query_groups": 8,
  "ffn_hidden_size": 25600,
  "kv_channels": 128,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,
  "fp32_residual_connection": false,
  "apply_residual_connection_post_layernorm": false,
  "layernorm_epsilon": 1e-06,
  "layernorm_zero_centered_gamma": false,
  "add_bias_linear": false,
  "add_qkv_bias": false,
  "gated_linear_unit": true,
  "activation_func_fp8_input_store": false,
  "glu_linear_offset": 0.0,
  "activation_func_clamp_value": null,
  "num_moe_experts": null,
  "rotary_interleaved": false,
  "window_size": null,
  "window_attn_skip_freq": null,
  "normalization": "RMSNorm",
  "qk_layernorm": true,
  "test_mode": false,
  "calculate_per_token_loss": false,
  "multi_latent_attention": false,
  "no_rope_freq": null,
  "moe_deepep_num_sms": 20,
  "init_method_std": 0.02,
  "embedding_init_method_std": 0.02,
  "init_model_with_meta_device": false,
  "apply_query_key_layer_scaling": false,
  "attention_softmax_in_fp32": false,
  "disable_bf16_reduced_precision_matmul": false,
  "bias_activation_fusion": false,
  "masked_softmax_fusion": true,
  "persist_layer_norm": false,
  "memory_efficient_layer_norm": false,
  "bias_dropout_fusion": false,
  "apply_rope_fusion": true,
  "use_fused_weighted_squared_relu": false,
  "fused_single_qkv_rope": false,
  "recompute_granularity": null,
  "recompute_method": null,
  "recompute_num_layers": null,
  "distribute_saved_activations": null,
  "recompute_modules": [
    "core_attn"
  ],
  "fp8": null,
  "fp8_recipe": "delayed",
  "fp8_param": false,
  "fp8_margin": 0,
  "fp8_interval": 1,
  "fp8_amax_history_len": 1,
  "fp8_amax_compute_algo": "most_recent",
  "fp8_wgrad": true,
  "fp8_dot_product_attention": false,
  "fp8_multi_head_attention": false,
  "tp_only_amax_red": false,
  "first_last_layers_bf16": false,
  "num_layers_at_start_in_bf16": 1,
  "num_layers_at_end_in_bf16": 1,
  "use_kitchen": false,
  "fp4": null,
  "fp4_recipe": "nvfp4",
  "fp4_param": false,
  "moe_shared_expert_intermediate_size": null,
  "moe_shared_expert_overlap": false,
  "moe_layer_freq": 1,
  "moe_ffn_hidden_size": null,
  "moe_router_load_balancing_type": "none",
  "moe_router_topk": 2,
  "moe_router_topk_limited_devices": null,
  "moe_router_padding_for_fp8": false,
  "moe_router_num_groups": null,
  "moe_router_group_topk": null,
  "moe_router_pre_softmax": false,
  "moe_router_topk_scaling_factor": null,
  "moe_router_score_function": "softmax",
  "moe_router_dtype": null,
  "moe_router_enable_expert_bias": false,
  "moe_router_bias_update_rate": 0.001,
  "moe_router_force_load_balancing": false,
  "moe_grouped_gemm": false,
  "moe_use_legacy_grouped_gemm": false,
  "moe_aux_loss_coeff": 0.0,
  "moe_z_loss_coeff": null,
  "moe_input_jitter_eps": null,
  "moe_token_dropping": false,
  "moe_token_dispatcher_type": "alltoall",
  "moe_enable_deepep": false,
  "moe_per_layer_logging": false,
  "moe_expert_capacity_factor": null,
  "moe_pad_expert_input_to_capacity": false,
  "moe_token_drop_policy": "probs",
  "moe_layer_recompute": false,
  "moe_permute_fusion": false,
  "moe_router_fusion": false,
  "moe_apply_probs_on_input": false,
  "cp_comm_type": null,
  "enable_cuda_graph": false,
  "cuda_graph_use_single_mempool": false,
  "cuda_graph_retain_backward_graph": false,
  "cuda_graph_warmup_steps": 3,
  "external_cuda_graph": false,
  "cuda_graph_impl": "none",
  "cuda_graph_scope": "full",
  "clone_scatter_output_in_embedding": true,
  "disable_parameter_transpose_cache": false,
  "config_logger_dir": "",
  "flash_decode": false,
  "use_te_activation_func": false,
  "inference_rng_tracker": false,
  "inference_sampling_seed": 42,
  "symmetric_ar_type": null,
  "mrope_section": null,
  "is_hybrid_model": false,
  "mamba_state_dim": 128,
  "mamba_head_dim": 64,
  "mamba_num_groups": 8,
  "mamba_num_heads": null,
  "use_mamba_mem_eff_path": true,
  "mlp_chunks_for_prefill": 1,
  "heterogeneous_block_specs": false,
  "hetereogenous_dist_checkpoint": false,
  "quant_recipe": null,
  "transformer_impl": "transformer_engine",
  "fp16_lm_cross_entropy": false,
  "parallel_output": true,
  "share_embeddings_and_output_weights": false,
  "make_vocab_size_divisible_by": 128,
  "position_embedding_type": "rope",
  "rotary_base": 1000000,
  "rotary_percent": 1.0,
  "seq_len_interpolation_factor": null,
  "seq_length": 40960,
  "scatter_embedding_sequence_parallel": true,
  "tp_comm_overlap_cfg": null,
  "use_transformer_engine_full_layer_spec": false,
  "use_transformer_engine_op_fuser": false,
  "hf_model_id": "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface",
  "generation_config": null,
  "vocab_size": 151936,
  "should_pad_vocab": false,
  "mtp_enabled": false,
  "restore_modelopt_state": false,
  "max_position_embeddings": 40960
}