{
"tensor_model_parallel_size": 8,
"pipeline_model_parallel_comm_backend": null,
"pipeline_model_parallel_size": 1,
"virtual_pipeline_model_parallel_size": null,
"sequence_parallel": true,
"context_parallel_size": 2,
"hierarchical_context_parallel_sizes": null,
"expert_model_parallel_size": 1,
"expert_tensor_parallel_size": 8,
"moe_extended_tp": false,
"perform_initialization": true,
"use_cpu_initialization": false,
"fp16": false,
"bf16": true,
"params_dtype": "torch.bfloat16",
"timers": null,
"finalize_model_grads_func": null,
"grad_scale_func": null,
"no_sync_func": null,
"grad_sync_func": null,
"param_sync_func": null,
"deterministic_mode": false,
"enable_autocast": false,
"autocast_dtype": "torch.bfloat16",
"num_microbatches_with_partial_activation_checkpoints": null,
"gradient_accumulation_fusion": true,
"async_tensor_model_parallel_allreduce": false,
"use_te_rng_tracker": false,
"tp_comm_overlap": false,
"tp_comm_bulk_wgrad": true,
"tp_comm_bulk_dgrad": true,
"tp_comm_overlap_ag": true,
"tp_comm_overlap_rs": true,
"tp_comm_overlap_rs_dgrad": false,
"tp_comm_split_ag": true,
"tp_comm_atomic_ag": false,
"tp_comm_split_rs": true,
"tp_comm_atomic_rs": false,
"cross_entropy_loss_fusion": true,
"cross_entropy_fusion_impl": "native",
"tp_comm_overlap_disable_qkv": false,
"tp_comm_overlap_disable_fc1": false,
"tp_comm_bootstrap_backend": "nccl",
"overlap_moe_expert_parallel_comm": false,
"delay_wgrad_compute": false,
"pipeline_dtype": null,
"variable_seq_lengths": true,
"overlap_p2p_comm": false,
"batch_p2p_comm": true,
"batch_p2p_sync": true,
"use_ring_exchange_p2p": false,
"deallocate_pipeline_outputs": true,
"defer_embedding_wgrad_compute": false,
"wgrad_deferral_limit": 0,
"overlap_p2p_comm_warmup_flush": false,
"microbatch_group_size_per_vp_stage": 1,
"cpu_offloading": false,
"cpu_offloading_num_layers": 0,
"_cpu_offloading_context": null,
"cpu_offloading_activations": true,
"cpu_offloading_weights": false,
"cpu_offloading_double_buffering": false,
"barrier_with_L1_time": true,
"num_layers": 64,
"mtp_num_layers": null,
"mtp_loss_scaling_factor": null,
"num_layers_in_first_pipeline_stage": null,
"num_layers_in_last_pipeline_stage": null,
"pipeline_model_parallel_layout": null,
"account_for_embedding_in_pipeline_split": false,
"account_for_loss_in_pipeline_split": false,
"hidden_size": 5120,
"num_attention_heads": 64,
"attention_backend": "AttnBackend.flash",
"softmax_scale": null,
"softmax_type": "vanilla",
"num_query_groups": 8,
"ffn_hidden_size": 25600,
"kv_channels": 128,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,
"fp32_residual_connection": false,
"apply_residual_connection_post_layernorm": false,
"layernorm_epsilon": 1e-06,
"layernorm_zero_centered_gamma": false,
"add_bias_linear": false,
"add_qkv_bias": false,
"gated_linear_unit": true,
"activation_func_fp8_input_store": false,
"glu_linear_offset": 0.0,
"activation_func_clamp_value": null,
"num_moe_experts": null,
"rotary_interleaved": false,
"window_size": null,
"window_attn_skip_freq": null,
"normalization": "RMSNorm",
"qk_layernorm": true,
"test_mode": false,
"calculate_per_token_loss": false,
"multi_latent_attention": false,
"no_rope_freq": null,
"moe_deepep_num_sms": 20,
"init_method_std": 0.02,
"embedding_init_method_std": 0.02,
"init_model_with_meta_device": false,
"apply_query_key_layer_scaling": false,
"attention_softmax_in_fp32": false,
"disable_bf16_reduced_precision_matmul": false,
"bias_activation_fusion": false,
"masked_softmax_fusion": true,
"persist_layer_norm": false,
"memory_efficient_layer_norm": false,
"bias_dropout_fusion": false,
"apply_rope_fusion": true,
"use_fused_weighted_squared_relu": false,
"fused_single_qkv_rope": false,
"recompute_granularity": null,
"recompute_method": null,
"recompute_num_layers": null,
"distribute_saved_activations": null,
"recompute_modules": [
"core_attn"
],
"fp8": null,
"fp8_recipe": "delayed",
"fp8_param": false,
"fp8_margin": 0,
"fp8_interval": 1,
"fp8_amax_history_len": 1,
"fp8_amax_compute_algo": "most_recent",
"fp8_wgrad": true,
"fp8_dot_product_attention": false,
"fp8_multi_head_attention": false,
"tp_only_amax_red": false,
"first_last_layers_bf16": false,
"num_layers_at_start_in_bf16": 1,
"num_layers_at_end_in_bf16": 1,
"use_kitchen": false,
"fp4": null,
"fp4_recipe": "nvfp4",
"fp4_param": false,
"moe_shared_expert_intermediate_size": null,
"moe_shared_expert_overlap": false,
"moe_layer_freq": 1,
"moe_ffn_hidden_size": null,
"moe_router_load_balancing_type": "none",
"moe_router_topk": 2,
"moe_router_topk_limited_devices": null,
"moe_router_padding_for_fp8": false,
"moe_router_num_groups": null,
"moe_router_group_topk": null,
"moe_router_pre_softmax": false,
"moe_router_topk_scaling_factor": null,
"moe_router_score_function": "softmax",
"moe_router_dtype": null,
"moe_router_enable_expert_bias": false,
"moe_router_bias_update_rate": 0.001,
"moe_router_force_load_balancing": false,
"moe_grouped_gemm": false,
"moe_use_legacy_grouped_gemm": false,
"moe_aux_loss_coeff": 0.0,
"moe_z_loss_coeff": null,
"moe_input_jitter_eps": null,
"moe_token_dropping": false,
"moe_token_dispatcher_type": "alltoall",
"moe_enable_deepep": false,
"moe_per_layer_logging": false,
"moe_expert_capacity_factor": null,
"moe_pad_expert_input_to_capacity": false,
"moe_token_drop_policy": "probs",
"moe_layer_recompute": false,
"moe_permute_fusion": false,
"moe_router_fusion": false,
"moe_apply_probs_on_input": false,
"cp_comm_type": null,
"enable_cuda_graph": false,
"cuda_graph_use_single_mempool": false,
"cuda_graph_retain_backward_graph": false,
"cuda_graph_warmup_steps": 3,
"external_cuda_graph": false,
"cuda_graph_impl": "none",
"cuda_graph_scope": "full",
"clone_scatter_output_in_embedding": true,
"disable_parameter_transpose_cache": false,
"config_logger_dir": "",
"flash_decode": false,
"use_te_activation_func": false,
"inference_rng_tracker": false,
"inference_sampling_seed": 42,
"symmetric_ar_type": null,
"mrope_section": null,
"is_hybrid_model": false,
"mamba_state_dim": 128,
"mamba_head_dim": 64,
"mamba_num_groups": 8,
"mamba_num_heads": null,
"use_mamba_mem_eff_path": true,
"mlp_chunks_for_prefill": 1,
"heterogeneous_block_specs": false,
"hetereogenous_dist_checkpoint": false,
"quant_recipe": null,
"transformer_impl": "transformer_engine",
"fp16_lm_cross_entropy": false,
"parallel_output": true,
"share_embeddings_and_output_weights": false,
"make_vocab_size_divisible_by": 128,
"position_embedding_type": "rope",
"rotary_base": 1000000,
"rotary_percent": 1.0,
"seq_len_interpolation_factor": null,
"seq_length": 40960,
"scatter_embedding_sequence_parallel": true,
"tp_comm_overlap_cfg": null,
"use_transformer_engine_full_layer_spec": false,
"use_transformer_engine_op_fuser": false,
"hf_model_id": "/data/checkpoints/megatron/round4_2400_distckpt_allOBO/global_step_45/actor/huggingface",
"generation_config": null,
"vocab_size": 151936,
"should_pad_vocab": false,
"mtp_enabled": false,
"restore_modelopt_state": false,
"max_position_embeddings": 40960
}