{ "tensor_model_parallel_size": 8, "pipeline_model_parallel_comm_backend": null, "pipeline_model_parallel_size": 1, "virtual_pipeline_model_parallel_size": null, "sequence_parallel": true, "context_parallel_size": 2, "hierarchical_context_parallel_sizes": null, "expert_model_parallel_size": 1, "expert_tensor_parallel_size": 8, "moe_extended_tp": false, "perform_initialization": true, "use_cpu_initialization": false, "fp16": false, "bf16": true, "params_dtype": "torch.bfloat16", "timers": null, "finalize_model_grads_func": null, "grad_scale_func": null, "no_sync_func": null, "grad_sync_func": null, "param_sync_func": null, "deterministic_mode": false, "enable_autocast": false, "autocast_dtype": "torch.bfloat16", "num_microbatches_with_partial_activation_checkpoints": null, "gradient_accumulation_fusion": true, "async_tensor_model_parallel_allreduce": false, "use_te_rng_tracker": false, "tp_comm_overlap": false, "tp_comm_bulk_wgrad": true, "tp_comm_bulk_dgrad": true, "tp_comm_overlap_ag": true, "tp_comm_overlap_rs": true, "tp_comm_overlap_rs_dgrad": false, "tp_comm_split_ag": true, "tp_comm_atomic_ag": false, "tp_comm_split_rs": true, "tp_comm_atomic_rs": false, "cross_entropy_loss_fusion": true, "cross_entropy_fusion_impl": "native", "tp_comm_overlap_disable_qkv": false, "tp_comm_overlap_disable_fc1": false, "tp_comm_bootstrap_backend": "nccl", "overlap_moe_expert_parallel_comm": false, "delay_wgrad_compute": false, "pipeline_dtype": null, "variable_seq_lengths": true, "overlap_p2p_comm": false, "batch_p2p_comm": true, "batch_p2p_sync": true, "use_ring_exchange_p2p": false, "deallocate_pipeline_outputs": true, "defer_embedding_wgrad_compute": false, "wgrad_deferral_limit": 0, "overlap_p2p_comm_warmup_flush": false, "microbatch_group_size_per_vp_stage": 1, "cpu_offloading": false, "cpu_offloading_num_layers": 0, "_cpu_offloading_context": null, "cpu_offloading_activations": true, "cpu_offloading_weights": false, "cpu_offloading_double_buffering": false, "barrier_with_L1_time": true, "num_layers": 64, "mtp_num_layers": null, "mtp_loss_scaling_factor": null, "num_layers_in_first_pipeline_stage": null, "num_layers_in_last_pipeline_stage": null, "pipeline_model_parallel_layout": null, "account_for_embedding_in_pipeline_split": false, "account_for_loss_in_pipeline_split": false, "hidden_size": 5120, "num_attention_heads": 64, "attention_backend": "AttnBackend.flash", "softmax_scale": null, "softmax_type": "vanilla", "num_query_groups": 8, "ffn_hidden_size": 25600, "kv_channels": 128, "hidden_dropout": 0.0, "attention_dropout": 0.0, "fp32_residual_connection": false, "apply_residual_connection_post_layernorm": false, "layernorm_epsilon": 1e-06, "layernorm_zero_centered_gamma": false, "add_bias_linear": false, "add_qkv_bias": false, "gated_linear_unit": true, "activation_func_fp8_input_store": false, "glu_linear_offset": 0.0, "activation_func_clamp_value": null, "num_moe_experts": null, "rotary_interleaved": false, "window_size": null, "window_attn_skip_freq": null, "normalization": "RMSNorm", "qk_layernorm": true, "test_mode": false, "calculate_per_token_loss": false, "multi_latent_attention": false, "no_rope_freq": null, "moe_deepep_num_sms": 20, "init_method_std": 0.02, "embedding_init_method_std": 0.02, "init_model_with_meta_device": false, "apply_query_key_layer_scaling": false, "attention_softmax_in_fp32": false, "disable_bf16_reduced_precision_matmul": false, "bias_activation_fusion": false, "masked_softmax_fusion": true, "persist_layer_norm": false, "memory_efficient_layer_norm": false, "bias_dropout_fusion": false, "apply_rope_fusion": true, "use_fused_weighted_squared_relu": false, "fused_single_qkv_rope": false, "recompute_granularity": null, "recompute_method": null, "recompute_num_layers": null, "distribute_saved_activations": null, "recompute_modules": [ "core_attn" ], "fp8": null, "fp8_recipe": "delayed", "fp8_param": false, "fp8_margin": 0, "fp8_interval": 1, "fp8_amax_history_len": 1, "fp8_amax_compute_algo": "most_recent", "fp8_wgrad": true, "fp8_dot_product_attention": false, "fp8_multi_head_attention": false, "tp_only_amax_red": false, "first_last_layers_bf16": false, "num_layers_at_start_in_bf16": 1, "num_layers_at_end_in_bf16": 1, "use_kitchen": false, "fp4": null, "fp4_recipe": "nvfp4", "fp4_param": false, "moe_shared_expert_intermediate_size": null, "moe_shared_expert_overlap": false, "moe_layer_freq": 1, "moe_ffn_hidden_size": null, "moe_router_load_balancing_type": "none", "moe_router_topk": 2, "moe_router_topk_limited_devices": null, "moe_router_padding_for_fp8": false, "moe_router_num_groups": null, "moe_router_group_topk": null, "moe_router_pre_softmax": false, "moe_router_topk_scaling_factor": null, "moe_router_score_function": "softmax", "moe_router_dtype": null, "moe_router_enable_expert_bias": false, "moe_router_bias_update_rate": 0.001, "moe_router_force_load_balancing": false, "moe_grouped_gemm": false, "moe_use_legacy_grouped_gemm": false, "moe_aux_loss_coeff": 0.0, "moe_z_loss_coeff": null, "moe_input_jitter_eps": null, "moe_token_dropping": false, "moe_token_dispatcher_type": "alltoall", "moe_enable_deepep": false, "moe_per_layer_logging": false, "moe_expert_capacity_factor": null, "moe_pad_expert_input_to_capacity": false, "moe_token_drop_policy": "probs", "moe_layer_recompute": false, "moe_permute_fusion": false, "moe_router_fusion": false, "moe_apply_probs_on_input": false, "cp_comm_type": null, "enable_cuda_graph": false, "cuda_graph_use_single_mempool": false, "cuda_graph_retain_backward_graph": false, "cuda_graph_warmup_steps": 3, "external_cuda_graph": false, "cuda_graph_impl": "none", "cuda_graph_scope": "full", "clone_scatter_output_in_embedding": true, "disable_parameter_transpose_cache": false, "config_logger_dir": "", "flash_decode": false, "use_te_activation_func": false, "inference_rng_tracker": false, "inference_sampling_seed": 42, "symmetric_ar_type": null, "mrope_section": null, "is_hybrid_model": false, "mamba_state_dim": 128, "mamba_head_dim": 64, "mamba_num_groups": 8, "mamba_num_heads": null, "use_mamba_mem_eff_path": true, "mlp_chunks_for_prefill": 1, "heterogeneous_block_specs": false, "hetereogenous_dist_checkpoint": false, "quant_recipe": null, "transformer_impl": "transformer_engine", "fp16_lm_cross_entropy": false, "parallel_output": true, "share_embeddings_and_output_weights": false, "make_vocab_size_divisible_by": 128, "position_embedding_type": "rope", "rotary_base": 1000000, "rotary_percent": 1.0, "seq_len_interpolation_factor": null, "seq_length": 40960, "scatter_embedding_sequence_parallel": true, "tp_comm_overlap_cfg": null, "use_transformer_engine_full_layer_spec": false, "use_transformer_engine_op_fuser": false, "hf_model_id": "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface", "generation_config": null, "vocab_size": 151936, "should_pad_vocab": false, "mtp_enabled": false, "restore_modelopt_state": false, "max_position_embeddings": 40960 }