wangrongsheng committed
Commit c4471ea · verified · 1 Parent(s): 10c9876

fix config

Files changed (1)
  1. config.json +11 -10
config.json CHANGED
@@ -8,25 +8,26 @@
  "AutoModel": "modeling_openpangu_moe.PanguUltraMoEModel",
  "AutoModelForCausalLM": "modeling_openpangu_moe.PanguUltraMoEForCausalLM"
  },
- "num_dense_layers": 3,
+ "first_k_dense_replace": 3,
  "hidden_act": "silu",
  "hidden_size": 7680,
  "initializer_range": 0.02,
  "intermediate_size": 18432,
- "attention_kv_lora_dim": 512,
+ "kv_lora_rank": 512,
  "max_position_embeddings": 131072,
  "model_type": "pangu_ultra_moe",
  "moe_intermediate_size": 2048,
- "num_routed_experts": 256,
- "num_shared_experts": 1,
+ "n_routed_experts": 256,
+ "n_shared_experts": 1,
+ "norm_topk_prob": true,
  "num_attention_heads": 128,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 61,
  "num_key_value_heads": 128,
- "num_mtp_layers": 1,
- "attention_q_lora_dim": 1536,
- "attention_qk_dim": 128,
- "attention_qk_rope_dim": 64,
+ "num_nextn_predict_layers": 1,
+ "q_lora_rank": 1536,
+ "qk_nope_head_dim": 128,
+ "qk_rope_head_dim": 64,
  "rms_norm_eps": 1e-05,
  "rope_theta": 25600000,
  "routed_scaling_factor": 2.5,
@@ -35,6 +36,6 @@
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.2",
  "use_cache": true,
- "attention_v_dim": 128,
+ "v_head_dim": 128,
  "vocab_size": 153600
- }
+ }
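
For reference, a minimal sketch (plain Python, standard library only) of a check that the rename took effect. It assumes the updated config.json sits in the current working directory; the comment about DeepSeek-V3-style naming is an inference from the new key names, not something stated in the commit message.

```python
import json

# Old key -> new key, as renamed by this commit. The new names follow
# DeepSeek-V3-style config conventions, presumably so the file matches
# what modeling_openpangu_moe expects.
RENAMED_KEYS = {
    "num_dense_layers": "first_k_dense_replace",
    "attention_kv_lora_dim": "kv_lora_rank",
    "num_routed_experts": "n_routed_experts",
    "num_shared_experts": "n_shared_experts",
    "num_mtp_layers": "num_nextn_predict_layers",
    "attention_q_lora_dim": "q_lora_rank",
    "attention_qk_dim": "qk_nope_head_dim",
    "attention_qk_rope_dim": "qk_rope_head_dim",
    "attention_v_dim": "v_head_dim",
}

# Assumption: the updated config.json is in the current directory.
with open("config.json") as f:
    cfg = json.load(f)

for old, new in RENAMED_KEYS.items():
    assert old not in cfg, f"stale key still present: {old}"
    assert new in cfg, f"renamed key missing: {new}"

# "norm_topk_prob": true is newly added by the commit (not a rename).
assert cfg.get("norm_topk_prob") is True

print("config.json carries the corrected parameter names")
```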