jinliuxi committed
Commit e872976 · verified · 1 Parent(s): 128fdb8

Upload Qwen3NextForCausalLM

Files changed (3)
  1. config.json +41 -27
  2. generation_config.json +5 -2
  3. model.safetensors +2 -2
config.json CHANGED
@@ -1,45 +1,59 @@
 {
   "architectures": [
-    "DeepseekV3ForCausalLM"
+    "Qwen3NextForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 0,
+  "decoder_sparse_step": 1,
+  "dtype": "float32",
   "eos_token_id": 1,
-  "first_k_dense_replace": 1,
-  "head_dim": 64,
+  "head_dim": 128,
   "hidden_act": "silu",
-  "hidden_size": 768,
+  "hidden_size": 512,
   "initializer_range": 0.02,
-  "intermediate_size": 2560,
-  "kv_lora_rank": 64,
+  "intermediate_size": 2016,
+  "layer_types": [
+    "linear_attention",
+    "linear_attention",
+    "linear_attention",
+    "full_attention",
+    "linear_attention",
+    "linear_attention",
+    "linear_attention",
+    "full_attention"
+  ],
+  "linear_conv_kernel_dim": 4,
+  "linear_key_head_dim": 64,
+  "linear_num_key_heads": 4,
+  "linear_num_value_heads": 8,
+  "linear_value_head_dim": 64,
   "max_position_embeddings": 4096,
-  "model_type": "deepseek_v3",
-  "moe_intermediate_size": 512,
-  "n_group": 8,
-  "n_routed_experts": 16,
-  "n_shared_experts": 1,
+  "mlp_only_layers": [],
+  "model_type": "qwen3_next",
+  "moe_intermediate_size": 224,
+  "mtp_beta": 0.3,
   "norm_topk_prob": true,
   "num_attention_heads": 8,
-  "num_experts_per_tok": 4,
+  "num_experts": 32,
+  "num_experts_per_tok": 8,
   "num_hidden_layers": 8,
-  "num_key_value_heads": 8,
-  "pretraining_tp": 1,
-  "q_lora_rank": 256,
-  "qk_head_dim": 192,
-  "qk_nope_head_dim": 128,
-  "qk_rope_head_dim": 64,
+  "num_key_value_heads": 2,
+  "num_nextn_predict_layers": 1,
+  "output_router_logits": false,
+  "pad_token_id": 1,
+  "partial_rotary_factor": 0.25,
   "rms_norm_eps": 1e-06,
-  "rope_interleave": true,
-  "rope_scaling": null,
+  "rope_parameters": {
+    "rope_theta": 10000.0,
+    "rope_type": "default"
+  },
   "rope_theta": 10000.0,
-  "routed_scaling_factor": 2.5,
+  "router_aux_loss_coef": 0.001,
+  "shared_expert_intermediate_size": 224,
   "tie_word_embeddings": true,
-  "topk_group": 4,
-  "scoring_func": "sigmoid",
-  "torch_dtype": "float32",
-  "transformers_version": "4.55.0",
-  "use_cache": false,
-  "v_head_dim": 128,
+  "transformers_version": "5.0.0.dev0",
+  "use_cache": true,
+  "use_mtp": true,
   "vocab_size": 129280
 }
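
For a quick check of the new architecture settings, the uploaded config can be read back with Transformers' `AutoConfig`. This is only a sketch: the repo id below is a placeholder (the commit view does not show it), and the `qwen3_next` model type requires a Transformers build that includes it (here, 5.0.0.dev0).

```python
# Sketch: inspect the updated config with Hugging Face Transformers.
# NOTE: "jinliuxi/<repo-id>" is a placeholder -- the actual repository name
# is not visible in this commit view.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("jinliuxi/<repo-id>")

print(config.model_type)       # expected: "qwen3_next"
print(config.architectures)    # expected: ["Qwen3NextForCausalLM"]
print(config.layer_types)      # three "linear_attention" layers, then "full_attention", repeated twice
print(config.num_experts, config.num_experts_per_tok)  # expected: 32 8
```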
generation_config.json CHANGED
@@ -1,6 +1,9 @@
 {
   "_from_model_config": true,
   "bos_token_id": 0,
-  "eos_token_id": 1,
-  "transformers_version": "4.55.0"
+  "eos_token_id": [
+    1
+  ],
+  "pad_token_id": 1,
+  "transformers_version": "5.0.0.dev0"
 }
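
The generation config now carries an explicit `pad_token_id` and a list-valued `eos_token_id`, both of which `generate()` picks up automatically. A minimal usage sketch, again with a placeholder repo id and assuming the repo also ships a tokenizer:

```python
# Sketch: the updated generation_config.json is applied automatically by generate().
# NOTE: "jinliuxi/<repo-id>" is a placeholder for the actual repository name.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("jinliuxi/<repo-id>")
tokenizer = AutoTokenizer.from_pretrained("jinliuxi/<repo-id>")

inputs = tokenizer("Hello", return_tensors="pt")
# eos_token_id=[1] and pad_token_id=1 come from generation_config.json, so
# generation stops on the EOS id and batched outputs can be padded without
# passing either id explicitly.
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```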
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9ca39c009e90baa3e65a7feb9b4db1854f00f233135c7284030964e72166d513
-size 1034093656
+oid sha256:d82762c785a084c730ef6f01bd4a294ebccc6b6f8a6b33fc6d13af6d61094fa7
+size 668921384