{
"add_tail_layers": false,
"architectures": [
"Ernie4_5_VLMoeForConditionalGeneration"
],
"attention_probs_dropout_prob": 0.0,
"auto_map": {
"AutoConfig": "baidu/ERNIE-4.5-VL-424B-A47B-PT--configuration_ernie4_5_vl.Ernie4_5_VLMoEConfig",
"AutoImageProcessor": "baidu/ERNIE-4.5-VL-424B-A47B-PT--processing_ernie4_5_vl.Ernie4_5_VLImageProcessor",
"AutoModel": "baidu/ERNIE-4.5-VL-424B-A47B-PT--modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
"AutoModelForCausalLM": "baidu/ERNIE-4.5-VL-424B-A47B-PT--modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
"AutoProcessor": "baidu/ERNIE-4.5-VL-424B-A47B-PT--processing_ernie4_5_vl.Ernie4_5_VLProcessor"
},
"bos_token_id": 1,
"cachekv_quant": false,
"compression_ratio": 1.0,
"disable_ffn_model_parallel": false,
"dpo_config": null,
"dtype": "bfloat16",
"enable_delay_scale_loss": true,
"eos_token_id": 2,
"freq_allocation": 20,
"fuse_attn_ffn": true,
"fuse_gate_detach_matmul": false,
"fuse_linear": false,
"fuse_ln": false,
"fuse_rms_norm": false,
"fuse_rope": false,
"fuse_softmax_mask": false,
"fuse_swiglu": false,
"global_aux_loss": false,
"hidden_act": "silu",
"hidden_dropout_prob": 0.0,
"hidden_size": 8,
"ignored_index": -100,
"im_patch_id": 100295,
"initializer_range": 0.20411393876950196,
"intermediate_size": 32,
"loss_subbatch_seqlen": 8192,
"max_position_embeddings": 131072,
"max_sequence_length": null,
"max_text_id": null,
"micro_batch_size": -1,
"mm_vocab_size": 0,
"modality_detach": false,
"model_type": "ernie4_5_moe_vl",
"moe_all_to_all_dropout": 0.0,
"moe_aux_loss_lambda": 0.01,
"moe_capacity": [
64,
64,
64
],
"moe_dense_experts_token_type_id": 3,
"moe_dropout_prob": 0.0,
"moe_fuse_experts": false,
"moe_gate": "topk",
"moe_gate_act": "softmax",
"moe_group": "world",
"moe_group_experts": false,
"moe_group_orthogonal_loss": true,
"moe_intermediate_size": [
32,
32
],
"moe_k": 8,
"moe_layer_end_index": 1,
"moe_layer_feed_fake_token": false,
"moe_layer_interval": 1,
"moe_layer_start_index": 1,
"moe_multimodal_dispatch_use_allgather": "v2-alltoall-unpad-text",
"moe_norm_gate_logits": true,
"moe_num_attn_experts": false,
"moe_num_experts": [
32,
32
],
"moe_num_shared_experts": 0,
"moe_orthogonal_loss_lambda": 0.01,
"moe_reverse_token_drop": false,
"moe_use_aux_free": true,
"moe_use_hard_gate": true,
"moe_use_size_all2all": false,
"moe_use_token_type_bias": false,
"moe_z_loss_lambda": 0.0001,
"num_acc_steps": 1,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 4,
"output_attentions": false,
"pad_token_id": 0,
"pixel_hidden_size": 16,
"pp_seg_method": "layer:ErnieDecoderLayer|EmptyLayer",
"recompute": false,
"recompute_granularity": "core_attn",
"recompute_use_reentrant": false,
"refined_recompute": {},
"resampler_fuse_rms_norm": false,
"rms_norm_eps": 1e-05,
"rope_3d": true,
"rope_scaling": {
"mrope_section": [
22,
22,
20
],
"type": "default"
},
"rope_theta": 500000,
"sinkhorn_2gate": true,
"sinkhorn_temp": 0.03,
"skip_recompute_ops": {},
"spatial_conv_size": 2,
"temporal_conv_size": 2,
"tensor_parallel_degree": 1,
"tie_word_embeddings": false,
"token_balance_loss": false,
"token_balance_seqlen": false,
"transformers_version": null,
"use_bias": false,
"use_cache": true,
"use_ep_comm_overlap": false,
"use_fast_ln": false,
"use_flash_attention": true,
"use_fused_head_and_loss_fn": false,
"use_recompute_lm_head": false,
"use_recompute_loss_fn": false,
"use_recompute_moe": false,
"use_recompute_resampler": false,
"use_rmsnorm": true,
"use_sparse_flash_attn": true,
"use_sparse_head_and_loss_fn": false,
"use_temporal_conv": true,
"use_var_len_flash_attn": false,
"using_precision_check": false,
"video_end_token_id": 101307,
"video_start_token_id": 101306,
"vision_config": {
"attn_implementation": "eager",
"attn_sep": true,
"depth": 2,
"embed_dim": 16,
"hidden_act": "quick_gelu",
"hidden_size": 16,
"in_channels": 3,
"in_chans": 3,
"mlp_ratio": 4,
"model_type": "DFNRope_vision_transformer",
"num_heads": 1,
"patch_size": 14,
"pp_data_balance": false,
"recompute": false,
"spatial_merge_size": 2,
"spatial_patch_size": 14,
"vit_first_fwd_bsz": 128,
"vit_num_recompute_layers": 10000
},
"vocab_size": 103424,
"weight_share_add_bias": true
}