lthn commited on
Commit
b3bba13
·
verified ·
1 Parent(s): e72cffa

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +347 -144
config.json CHANGED
@@ -1,151 +1,354 @@
1
  {
2
- "architectures": [
3
- "Gemma4ForConditionalGeneration"
4
- ],
5
- "audio_config": null,
6
- "audio_token_id": 258881,
7
- "boa_token_id": 256000,
8
- "boi_token_id": 255999,
9
- "dtype": "bfloat16",
10
- "eoa_token_id": 258883,
11
- "eoa_token_index": 258883,
12
- "eoi_token_id": 258882,
13
- "eos_token_id": [
14
- 1,
15
- 106
16
- ],
17
- "image_token_id": 258880,
18
- "initializer_range": 0.02,
19
- "model_type": "gemma4",
20
- "text_config": {
21
- "attention_bias": false,
22
- "attention_dropout": 0.0,
23
- "attention_k_eq_v": true,
24
- "bos_token_id": 2,
25
- "dtype": "bfloat16",
26
- "enable_moe_block": true,
27
- "eos_token_id": 1,
28
- "final_logit_softcapping": 30.0,
29
- "global_head_dim": 512,
30
- "head_dim": 256,
31
- "hidden_activation": "gelu_pytorch_tanh",
32
- "hidden_size": 2816,
33
- "hidden_size_per_layer_input": 0,
34
- "initializer_range": 0.02,
35
- "intermediate_size": 2112,
36
- "layer_types": [
37
- "sliding_attention",
38
- "sliding_attention",
39
- "sliding_attention",
40
- "sliding_attention",
41
- "sliding_attention",
42
- "full_attention",
43
- "sliding_attention",
44
- "sliding_attention",
45
- "sliding_attention",
46
- "sliding_attention",
47
- "sliding_attention",
48
- "full_attention",
49
- "sliding_attention",
50
- "sliding_attention",
51
- "sliding_attention",
52
- "sliding_attention",
53
- "sliding_attention",
54
- "full_attention",
55
- "sliding_attention",
56
- "sliding_attention",
57
- "sliding_attention",
58
- "sliding_attention",
59
- "sliding_attention",
60
- "full_attention",
61
- "sliding_attention",
62
- "sliding_attention",
63
- "sliding_attention",
64
- "sliding_attention",
65
- "sliding_attention",
66
- "full_attention"
67
  ],
68
- "max_position_embeddings": 262144,
69
- "model_type": "gemma4_text",
70
- "moe_intermediate_size": 704,
71
- "num_attention_heads": 16,
72
- "num_experts": 128,
73
- "num_global_key_value_heads": 2,
74
- "num_hidden_layers": 30,
75
- "num_key_value_heads": 8,
76
- "num_kv_shared_layers": 0,
77
- "pad_token_id": 0,
78
- "rms_norm_eps": 1e-06,
79
- "rope_parameters": {
80
- "full_attention": {
81
- "partial_rotary_factor": 0.25,
82
- "rope_theta": 1000000.0,
83
- "rope_type": "proportional"
84
- },
85
- "sliding_attention": {
86
- "rope_theta": 10000.0,
87
- "rope_type": "default"
88
- }
89
- },
90
- "sliding_window": 1024,
91
- "tie_word_embeddings": true,
92
- "top_k_experts": 8,
93
- "use_bidirectional_attention": "vision",
94
- "use_cache": true,
95
- "use_double_wide_mlp": false,
96
- "vocab_size": 262144,
97
- "vocab_size_per_layer_input": 262144
98
- },
99
- "tie_word_embeddings": true,
100
- "transformers_version": "5.5.0.dev0",
101
- "video_token_id": 258884,
102
- "vision_config": {
103
- "_name_or_path": "",
104
- "architectures": null,
105
- "attention_bias": false,
106
- "attention_dropout": 0.0,
107
- "chunk_size_feed_forward": 0,
108
- "default_output_length": 280,
109
  "dtype": "bfloat16",
110
- "global_head_dim": 72,
111
- "head_dim": 72,
112
- "hidden_activation": "gelu_pytorch_tanh",
113
- "hidden_size": 1152,
114
- "id2label": {
115
- "0": "LABEL_0",
116
- "1": "LABEL_1"
117
- },
 
118
  "initializer_range": 0.02,
119
- "intermediate_size": 4304,
120
- "is_encoder_decoder": false,
121
- "label2id": {
122
- "LABEL_0": 0,
123
- "LABEL_1": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  },
125
- "max_position_embeddings": 131072,
126
- "model_type": "gemma4_vision",
127
- "num_attention_heads": 16,
128
- "num_hidden_layers": 27,
129
- "num_key_value_heads": 16,
130
- "output_attentions": false,
131
- "output_hidden_states": false,
132
- "patch_size": 16,
133
- "pooling_kernel_size": 3,
134
- "position_embedding_size": 10240,
135
- "problem_type": null,
136
- "return_dict": true,
137
- "rms_norm_eps": 1e-06,
138
- "rope_parameters": {
139
- "rope_theta": 100.0,
140
- "rope_type": "default"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  },
142
- "standardize": true,
143
- "use_clipped_linears": false
144
- },
145
- "vision_soft_tokens_per_image": 280,
146
- "quantization_config": {
147
- "bits": 4,
148
- "group_size": 64,
149
- "mode": "affine"
150
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
 
1
  {
2
+ "architectures": [
3
+ "Gemma4ForConditionalGeneration"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  ],
5
+ "audio_config": null,
6
+ "audio_token_id": 258881,
7
+ "boa_token_id": 256000,
8
+ "boi_token_id": 255999,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "dtype": "bfloat16",
10
+ "eoa_token_id": 258883,
11
+ "eoa_token_index": 258883,
12
+ "eoi_token_id": 258882,
13
+ "eos_token_id": [
14
+ 1,
15
+ 106,
16
+ 50
17
+ ],
18
+ "image_token_id": 258880,
19
  "initializer_range": 0.02,
20
+ "model_type": "gemma4",
21
+ "quantization": {
22
+ "group_size": 64,
23
+ "bits": 4,
24
+ "mode": "affine",
25
+ "language_model.model.layers.0.router.proj": {
26
+ "group_size": 64,
27
+ "bits": 8
28
+ },
29
+ "language_model.model.layers.1.router.proj": {
30
+ "group_size": 64,
31
+ "bits": 8
32
+ },
33
+ "language_model.model.layers.2.router.proj": {
34
+ "group_size": 64,
35
+ "bits": 8
36
+ },
37
+ "language_model.model.layers.3.router.proj": {
38
+ "group_size": 64,
39
+ "bits": 8
40
+ },
41
+ "language_model.model.layers.4.router.proj": {
42
+ "group_size": 64,
43
+ "bits": 8
44
+ },
45
+ "language_model.model.layers.5.router.proj": {
46
+ "group_size": 64,
47
+ "bits": 8
48
+ },
49
+ "language_model.model.layers.6.router.proj": {
50
+ "group_size": 64,
51
+ "bits": 8
52
+ },
53
+ "language_model.model.layers.7.router.proj": {
54
+ "group_size": 64,
55
+ "bits": 8
56
+ },
57
+ "language_model.model.layers.8.router.proj": {
58
+ "group_size": 64,
59
+ "bits": 8
60
+ },
61
+ "language_model.model.layers.9.router.proj": {
62
+ "group_size": 64,
63
+ "bits": 8
64
+ },
65
+ "language_model.model.layers.10.router.proj": {
66
+ "group_size": 64,
67
+ "bits": 8
68
+ },
69
+ "language_model.model.layers.11.router.proj": {
70
+ "group_size": 64,
71
+ "bits": 8
72
+ },
73
+ "language_model.model.layers.12.router.proj": {
74
+ "group_size": 64,
75
+ "bits": 8
76
+ },
77
+ "language_model.model.layers.13.router.proj": {
78
+ "group_size": 64,
79
+ "bits": 8
80
+ },
81
+ "language_model.model.layers.14.router.proj": {
82
+ "group_size": 64,
83
+ "bits": 8
84
+ },
85
+ "language_model.model.layers.15.router.proj": {
86
+ "group_size": 64,
87
+ "bits": 8
88
+ },
89
+ "language_model.model.layers.16.router.proj": {
90
+ "group_size": 64,
91
+ "bits": 8
92
+ },
93
+ "language_model.model.layers.17.router.proj": {
94
+ "group_size": 64,
95
+ "bits": 8
96
+ },
97
+ "language_model.model.layers.18.router.proj": {
98
+ "group_size": 64,
99
+ "bits": 8
100
+ },
101
+ "language_model.model.layers.19.router.proj": {
102
+ "group_size": 64,
103
+ "bits": 8
104
+ },
105
+ "language_model.model.layers.20.router.proj": {
106
+ "group_size": 64,
107
+ "bits": 8
108
+ },
109
+ "language_model.model.layers.21.router.proj": {
110
+ "group_size": 64,
111
+ "bits": 8
112
+ },
113
+ "language_model.model.layers.22.router.proj": {
114
+ "group_size": 64,
115
+ "bits": 8
116
+ },
117
+ "language_model.model.layers.23.router.proj": {
118
+ "group_size": 64,
119
+ "bits": 8
120
+ },
121
+ "language_model.model.layers.24.router.proj": {
122
+ "group_size": 64,
123
+ "bits": 8
124
+ },
125
+ "language_model.model.layers.25.router.proj": {
126
+ "group_size": 64,
127
+ "bits": 8
128
+ },
129
+ "language_model.model.layers.26.router.proj": {
130
+ "group_size": 64,
131
+ "bits": 8
132
+ },
133
+ "language_model.model.layers.27.router.proj": {
134
+ "group_size": 64,
135
+ "bits": 8
136
+ },
137
+ "language_model.model.layers.28.router.proj": {
138
+ "group_size": 64,
139
+ "bits": 8
140
+ },
141
+ "language_model.model.layers.29.router.proj": {
142
+ "group_size": 64,
143
+ "bits": 8
144
+ }
145
  },
146
+ "quantization_config": {
147
+ "group_size": 64,
148
+ "bits": 4,
149
+ "mode": "affine",
150
+ "language_model.model.layers.0.router.proj": {
151
+ "group_size": 64,
152
+ "bits": 8
153
+ },
154
+ "language_model.model.layers.1.router.proj": {
155
+ "group_size": 64,
156
+ "bits": 8
157
+ },
158
+ "language_model.model.layers.2.router.proj": {
159
+ "group_size": 64,
160
+ "bits": 8
161
+ },
162
+ "language_model.model.layers.3.router.proj": {
163
+ "group_size": 64,
164
+ "bits": 8
165
+ },
166
+ "language_model.model.layers.4.router.proj": {
167
+ "group_size": 64,
168
+ "bits": 8
169
+ },
170
+ "language_model.model.layers.5.router.proj": {
171
+ "group_size": 64,
172
+ "bits": 8
173
+ },
174
+ "language_model.model.layers.6.router.proj": {
175
+ "group_size": 64,
176
+ "bits": 8
177
+ },
178
+ "language_model.model.layers.7.router.proj": {
179
+ "group_size": 64,
180
+ "bits": 8
181
+ },
182
+ "language_model.model.layers.8.router.proj": {
183
+ "group_size": 64,
184
+ "bits": 8
185
+ },
186
+ "language_model.model.layers.9.router.proj": {
187
+ "group_size": 64,
188
+ "bits": 8
189
+ },
190
+ "language_model.model.layers.10.router.proj": {
191
+ "group_size": 64,
192
+ "bits": 8
193
+ },
194
+ "language_model.model.layers.11.router.proj": {
195
+ "group_size": 64,
196
+ "bits": 8
197
+ },
198
+ "language_model.model.layers.12.router.proj": {
199
+ "group_size": 64,
200
+ "bits": 8
201
+ },
202
+ "language_model.model.layers.13.router.proj": {
203
+ "group_size": 64,
204
+ "bits": 8
205
+ },
206
+ "language_model.model.layers.14.router.proj": {
207
+ "group_size": 64,
208
+ "bits": 8
209
+ },
210
+ "language_model.model.layers.15.router.proj": {
211
+ "group_size": 64,
212
+ "bits": 8
213
+ },
214
+ "language_model.model.layers.16.router.proj": {
215
+ "group_size": 64,
216
+ "bits": 8
217
+ },
218
+ "language_model.model.layers.17.router.proj": {
219
+ "group_size": 64,
220
+ "bits": 8
221
+ },
222
+ "language_model.model.layers.18.router.proj": {
223
+ "group_size": 64,
224
+ "bits": 8
225
+ },
226
+ "language_model.model.layers.19.router.proj": {
227
+ "group_size": 64,
228
+ "bits": 8
229
+ },
230
+ "language_model.model.layers.20.router.proj": {
231
+ "group_size": 64,
232
+ "bits": 8
233
+ },
234
+ "language_model.model.layers.21.router.proj": {
235
+ "group_size": 64,
236
+ "bits": 8
237
+ },
238
+ "language_model.model.layers.22.router.proj": {
239
+ "group_size": 64,
240
+ "bits": 8
241
+ },
242
+ "language_model.model.layers.23.router.proj": {
243
+ "group_size": 64,
244
+ "bits": 8
245
+ },
246
+ "language_model.model.layers.24.router.proj": {
247
+ "group_size": 64,
248
+ "bits": 8
249
+ },
250
+ "language_model.model.layers.25.router.proj": {
251
+ "group_size": 64,
252
+ "bits": 8
253
+ },
254
+ "language_model.model.layers.26.router.proj": {
255
+ "group_size": 64,
256
+ "bits": 8
257
+ },
258
+ "language_model.model.layers.27.router.proj": {
259
+ "group_size": 64,
260
+ "bits": 8
261
+ },
262
+ "language_model.model.layers.28.router.proj": {
263
+ "group_size": 64,
264
+ "bits": 8
265
+ },
266
+ "language_model.model.layers.29.router.proj": {
267
+ "group_size": 64,
268
+ "bits": 8
269
+ }
270
  },
271
+ "text_config": {
272
+ "attention_bias": false,
273
+ "attention_dropout": 0.0,
274
+ "attention_k_eq_v": true,
275
+ "bos_token_id": 2,
276
+ "dtype": "bfloat16",
277
+ "enable_moe_block": true,
278
+ "eos_token_id": 1,
279
+ "final_logit_softcapping": 30.0,
280
+ "global_head_dim": 512,
281
+ "head_dim": 256,
282
+ "hidden_activation": "gelu_pytorch_tanh",
283
+ "hidden_size": 2816,
284
+ "hidden_size_per_layer_input": 0,
285
+ "initializer_range": 0.02,
286
+ "intermediate_size": 2112,
287
+ "layer_types": [
288
+ "sliding_attention",
289
+ "sliding_attention",
290
+ "sliding_attention",
291
+ "sliding_attention",
292
+ "sliding_attention",
293
+ "full_attention",
294
+ "sliding_attention",
295
+ "sliding_attention",
296
+ "sliding_attention",
297
+ "sliding_attention",
298
+ "sliding_attention",
299
+ "full_attention",
300
+ "sliding_attention",
301
+ "sliding_attention",
302
+ "sliding_attention",
303
+ "sliding_attention",
304
+ "sliding_attention",
305
+ "full_attention",
306
+ "sliding_attention",
307
+ "sliding_attention",
308
+ "sliding_attention",
309
+ "sliding_attention",
310
+ "sliding_attention",
311
+ "full_attention",
312
+ "sliding_attention",
313
+ "sliding_attention",
314
+ "sliding_attention",
315
+ "sliding_attention",
316
+ "sliding_attention",
317
+ "full_attention"
318
+ ],
319
+ "max_position_embeddings": 262144,
320
+ "model_type": "gemma4_text",
321
+ "moe_intermediate_size": 704,
322
+ "num_attention_heads": 16,
323
+ "num_experts": 128,
324
+ "num_global_key_value_heads": 2,
325
+ "num_hidden_layers": 30,
326
+ "num_key_value_heads": 8,
327
+ "num_kv_shared_layers": 0,
328
+ "pad_token_id": 0,
329
+ "rms_norm_eps": 1e-06,
330
+ "rope_parameters": {
331
+ "full_attention": {
332
+ "partial_rotary_factor": 0.25,
333
+ "rope_theta": 1000000.0,
334
+ "rope_type": "proportional"
335
+ },
336
+ "sliding_attention": {
337
+ "rope_theta": 10000.0,
338
+ "rope_type": "default"
339
+ }
340
+ },
341
+ "sliding_window": 1024,
342
+ "tie_word_embeddings": true,
343
+ "top_k_experts": 8,
344
+ "use_bidirectional_attention": "vision",
345
+ "use_cache": true,
346
+ "use_double_wide_mlp": false,
347
+ "vocab_size": 262144,
348
+ "vocab_size_per_layer_input": 262144
349
+ },
350
+ "tie_word_embeddings": true,
351
+ "transformers_version": "5.5.0.dev0",
352
+ "video_token_id": 258884,
353
+ "vision_soft_tokens_per_image": 280
354
  }