Lyric1010 committed on
Commit
bc51930
·
verified ·
1 Parent(s): 271a6e6

Upload config.json to GDN-distill

Browse files
Files changed (1) hide show
  1. config.json +167 -0
config.json ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3NextForCausalLM"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "attn_output_gate": false,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_qwen3_next.Qwen3NextConfig",
10
+ "AutoModel": "modeling_qwen3_next.Qwen3NextForCausalLM",
11
+ "AutoModelForCausalLM": "modeling_qwen3_next.Qwen3NextForCausalLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "decoder_sparse_step": 1,
15
+ "dtype": "float32",
16
+ "enable_qk_norm": false,
17
+ "eos_token_id": 2,
18
+ "full_attention_interval": 0,
19
+ "head_dim": 64,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 1920,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4800,
24
+ "layer_types": [
25
+ "full_attention",
26
+ "linear_attention",
27
+ "linear_attention",
28
+ "linear_attention",
29
+ "linear_attention",
30
+ "linear_attention",
31
+ "linear_attention",
32
+ "linear_attention",
33
+ "full_attention",
34
+ "linear_attention",
35
+ "linear_attention",
36
+ "linear_attention",
37
+ "linear_attention",
38
+ "linear_attention",
39
+ "linear_attention",
40
+ "linear_attention",
41
+ "full_attention",
42
+ "linear_attention",
43
+ "linear_attention",
44
+ "linear_attention",
45
+ "linear_attention",
46
+ "linear_attention",
47
+ "linear_attention",
48
+ "linear_attention",
49
+ "full_attention",
50
+ "linear_attention",
51
+ "linear_attention",
52
+ "linear_attention",
53
+ "linear_attention",
54
+ "linear_attention",
55
+ "linear_attention",
56
+ "linear_attention",
57
+ "full_attention",
58
+ "linear_attention",
59
+ "linear_attention",
60
+ "linear_attention",
61
+ "linear_attention",
62
+ "linear_attention",
63
+ "linear_attention",
64
+ "linear_attention",
65
+ "full_attention",
66
+ "linear_attention",
67
+ "linear_attention",
68
+ "linear_attention",
69
+ "linear_attention",
70
+ "linear_attention",
71
+ "linear_attention",
72
+ "linear_attention",
73
+ "full_attention",
74
+ "linear_attention",
75
+ "linear_attention",
76
+ "linear_attention",
77
+ "linear_attention",
78
+ "linear_attention",
79
+ "linear_attention",
80
+ "linear_attention"
81
+ ],
82
+ "linear_conv_kernel_dim": 4,
83
+ "linear_key_head_dim": 64,
84
+ "linear_num_key_heads": 8,
85
+ "linear_num_value_heads": 32,
86
+ "linear_value_head_dim": 64,
87
+ "max_position_embeddings": 32768,
88
+ "mlp_only_layers": [
89
+ 0,
90
+ 1,
91
+ 2,
92
+ 3,
93
+ 4,
94
+ 5,
95
+ 6,
96
+ 7,
97
+ 8,
98
+ 9,
99
+ 10,
100
+ 11,
101
+ 12,
102
+ 13,
103
+ 14,
104
+ 15,
105
+ 16,
106
+ 17,
107
+ 18,
108
+ 19,
109
+ 20,
110
+ 21,
111
+ 22,
112
+ 23,
113
+ 24,
114
+ 25,
115
+ 26,
116
+ 27,
117
+ 28,
118
+ 29,
119
+ 30,
120
+ 31,
121
+ 32,
122
+ 33,
123
+ 34,
124
+ 35,
125
+ 36,
126
+ 37,
127
+ 38,
128
+ 39,
129
+ 40,
130
+ 41,
131
+ 42,
132
+ 43,
133
+ 44,
134
+ 45,
135
+ 46,
136
+ 47,
137
+ 48,
138
+ 49,
139
+ 50,
140
+ 51,
141
+ 52,
142
+ 53,
143
+ 54,
144
+ 55
145
+ ],
146
+ "model_type": "qwen3_next",
147
+ "moe_intermediate_size": 0,
148
+ "norm_topk_prob": true,
149
+ "num_attention_heads": 30,
150
+ "num_experts": 0,
151
+ "num_experts_per_tok": 0,
152
+ "num_hidden_layers": 56,
153
+ "num_key_value_heads": 6,
154
+ "output_router_logits": false,
155
+ "partial_rotary_factor": 1.0,
156
+ "rms_norm_eps": 1e-05,
157
+ "rope_scaling": null,
158
+ "rope_theta": 490000.0,
159
+ "router_aux_loss_coef": 0.001,
160
+ "router_bias": false,
161
+ "shared_expert_intermediate_size": 0,
162
+ "tie_word_embeddings": false,
163
+ "transformers_version": "4.57.1",
164
+ "use_cache": true,
165
+ "use_sliding_window": false,
166
+ "vocab_size": 99000
167
+ }