{
  "activation_type": "swiglu",
  "alibi": false,
  "alibi_bias_max": 8.0,
  "architectures": [
    "AIGCodeXMoEForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layer_norm": false,
  "attention_layer_norm_with_affine": false,
  "batch_size": 4,
  "bias_for_layer_norm": false,
  "block_group_size": 1,
  "block_type": "sequential",
  "clip_qkv": null,
  "d_model": 4096,
  "deepnorm": false,
  "embedding_dropout": 0.0,
  "embedding_size": 65280,
  "encoder_decoder": false,
  "eos_token_id": 2,
  "eval_max_sequence_length": null,
  "exp_dim_ratio": 1,
  "flash_attention": false,
  "gate_level": "token",
  "gate_sample_ratio": 1,
  "gate_softmax_temperature": 8.0,
  "gshard": false,
  "include_bias": false,
  "init_cutoff_factor": null,
  "init_device": "meta",
  "init_fn": "normal",
  "init_std": 0.01,
  "intermediate_size": 16384,
  "latent_attention": false,
  "latent_attention_dim": 512,
  "layer_norm_eps": 1e-05,
  "layer_norm_type": "default",
  "layer_norm_with_affine": false,
  "layer_share": false,
  "layer_share_mlp_version": 1,
  "layer_std_check": false,
  "max_sequence_length": 4096,
  "mlp_hidden_size": null,
  "mlp_ratio": 4,
  "mobile_llm_repeat_num": 1,
  "model_type": "hf_aigcodexmoe",
  "moe_act_ckpt_ratio": 1,
  "moe_auxiliary_loss": false,
  "moe_auxiliary_loss_weight": 0.0,
  "moe_batch_prioritized_routing": false,
  "moe_eval_capacity_token_fraction": 0.25,
  "moe_expert_count": 4,
  "moe_expert_count_mluti_level": null,
  "moe_freq": 2,
  "moe_freq_pos": 0,
  "moe_gate_input_type": "concat",
  "moe_gate_loss_combine_method": "average",
  "moe_gate_loss_weight": 0.0,
  "moe_gate_no_grad": false,
  "moe_gating_use_fp32": true,
  "moe_logging": false,
  "moe_normalize_gate_prob_before_dropping": false,
  "moe_second_expert_policy": "sampling",
  "moe_share_expert_count": 0,
  "moe_top1_expert": true,
  "moe_topn_expert": 1,
  "moe_version": 1,
  "multi_query_attention": false,
  "n_heads": 32,
  "n_kv_heads": null,
  "n_layers": 22,
  "pad_token_id": 0,
  "ple_layer_num": 0,
  "ple_layernorm": false,
  "precision": "amp_bf16",
  "residual_dropout": 0.0,
  "rope": true,
  "rope_base": 30000,
  "rope_ext_ratio": 1,
  "rope_full_precision": true,
  "scale_logits": false,
  "sft_ans_mask": false,
  "share_layer_groups": 1,
  "share_moe_groups": 1,
  "torch_dtype": "float32",
  "transformers_version": "4.40.2",
  "use_cache": true,
  "use_mobile_llm": false,
  "use_moe": false,
  "use_ple": false,
  "use_xmoe": true,
  "vocab_size": 64003,
  "weight_tying": false
}