mrtoots commited on
Commit
a9e840c
·
verified ·
1 Parent(s): 6e17eab

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +873 -0
config.json ADDED
@@ -0,0 +1,873 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3NextForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0,
7
+ "decoder_sparse_step": 1,
8
+ "eos_token_id": 151645,
9
+ "full_attention_interval": 4,
10
+ "head_dim": 256,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5120,
15
+ "layer_types": [
16
+ "linear_attention",
17
+ "linear_attention",
18
+ "linear_attention",
19
+ "full_attention",
20
+ "linear_attention",
21
+ "linear_attention",
22
+ "linear_attention",
23
+ "full_attention",
24
+ "linear_attention",
25
+ "linear_attention",
26
+ "linear_attention",
27
+ "full_attention",
28
+ "linear_attention",
29
+ "linear_attention",
30
+ "linear_attention",
31
+ "full_attention",
32
+ "linear_attention",
33
+ "linear_attention",
34
+ "linear_attention",
35
+ "full_attention",
36
+ "linear_attention",
37
+ "linear_attention",
38
+ "linear_attention",
39
+ "full_attention",
40
+ "linear_attention",
41
+ "linear_attention",
42
+ "linear_attention",
43
+ "full_attention",
44
+ "linear_attention",
45
+ "linear_attention",
46
+ "linear_attention",
47
+ "full_attention",
48
+ "linear_attention",
49
+ "linear_attention",
50
+ "linear_attention",
51
+ "full_attention",
52
+ "linear_attention",
53
+ "linear_attention",
54
+ "linear_attention",
55
+ "full_attention",
56
+ "linear_attention",
57
+ "linear_attention",
58
+ "linear_attention",
59
+ "full_attention",
60
+ "linear_attention",
61
+ "linear_attention",
62
+ "linear_attention",
63
+ "full_attention"
64
+ ],
65
+ "linear_conv_kernel_dim": 4,
66
+ "linear_key_head_dim": 128,
67
+ "linear_num_key_heads": 16,
68
+ "linear_num_value_heads": 32,
69
+ "linear_value_head_dim": 128,
70
+ "max_position_embeddings": 262144,
71
+ "mlp_only_layers": [],
72
+ "model_type": "qwen3_next",
73
+ "moe_intermediate_size": 512,
74
+ "norm_topk_prob": true,
75
+ "num_attention_heads": 16,
76
+ "num_experts": 512,
77
+ "num_experts_per_tok": 10,
78
+ "num_hidden_layers": 48,
79
+ "num_key_value_heads": 2,
80
+ "output_router_logits": false,
81
+ "pad_token_id": 151654,
82
+ "partial_rotary_factor": 0.25,
83
+ "quantization": {
84
+ "group_size": 64,
85
+ "bits": 8,
86
+ "mode": "affine",
87
+ "model.layers.0.mlp.gate": {
88
+ "group_size": 64,
89
+ "bits": 8
90
+ },
91
+ "model.layers.0.mlp.shared_expert_gate": {
92
+ "group_size": 64,
93
+ "bits": 8
94
+ },
95
+ "model.layers.1.mlp.gate": {
96
+ "group_size": 64,
97
+ "bits": 8
98
+ },
99
+ "model.layers.1.mlp.shared_expert_gate": {
100
+ "group_size": 64,
101
+ "bits": 8
102
+ },
103
+ "model.layers.2.mlp.gate": {
104
+ "group_size": 64,
105
+ "bits": 8
106
+ },
107
+ "model.layers.2.mlp.shared_expert_gate": {
108
+ "group_size": 64,
109
+ "bits": 8
110
+ },
111
+ "model.layers.3.mlp.gate": {
112
+ "group_size": 64,
113
+ "bits": 8
114
+ },
115
+ "model.layers.3.mlp.shared_expert_gate": {
116
+ "group_size": 64,
117
+ "bits": 8
118
+ },
119
+ "model.layers.4.mlp.gate": {
120
+ "group_size": 64,
121
+ "bits": 8
122
+ },
123
+ "model.layers.4.mlp.shared_expert_gate": {
124
+ "group_size": 64,
125
+ "bits": 8
126
+ },
127
+ "model.layers.5.mlp.gate": {
128
+ "group_size": 64,
129
+ "bits": 8
130
+ },
131
+ "model.layers.5.mlp.shared_expert_gate": {
132
+ "group_size": 64,
133
+ "bits": 8
134
+ },
135
+ "model.layers.6.mlp.gate": {
136
+ "group_size": 64,
137
+ "bits": 8
138
+ },
139
+ "model.layers.6.mlp.shared_expert_gate": {
140
+ "group_size": 64,
141
+ "bits": 8
142
+ },
143
+ "model.layers.7.mlp.gate": {
144
+ "group_size": 64,
145
+ "bits": 8
146
+ },
147
+ "model.layers.7.mlp.shared_expert_gate": {
148
+ "group_size": 64,
149
+ "bits": 8
150
+ },
151
+ "model.layers.8.mlp.gate": {
152
+ "group_size": 64,
153
+ "bits": 8
154
+ },
155
+ "model.layers.8.mlp.shared_expert_gate": {
156
+ "group_size": 64,
157
+ "bits": 8
158
+ },
159
+ "model.layers.9.mlp.gate": {
160
+ "group_size": 64,
161
+ "bits": 8
162
+ },
163
+ "model.layers.9.mlp.shared_expert_gate": {
164
+ "group_size": 64,
165
+ "bits": 8
166
+ },
167
+ "model.layers.10.mlp.gate": {
168
+ "group_size": 64,
169
+ "bits": 8
170
+ },
171
+ "model.layers.10.mlp.shared_expert_gate": {
172
+ "group_size": 64,
173
+ "bits": 8
174
+ },
175
+ "model.layers.11.mlp.gate": {
176
+ "group_size": 64,
177
+ "bits": 8
178
+ },
179
+ "model.layers.11.mlp.shared_expert_gate": {
180
+ "group_size": 64,
181
+ "bits": 8
182
+ },
183
+ "model.layers.12.mlp.gate": {
184
+ "group_size": 64,
185
+ "bits": 8
186
+ },
187
+ "model.layers.12.mlp.shared_expert_gate": {
188
+ "group_size": 64,
189
+ "bits": 8
190
+ },
191
+ "model.layers.13.mlp.gate": {
192
+ "group_size": 64,
193
+ "bits": 8
194
+ },
195
+ "model.layers.13.mlp.shared_expert_gate": {
196
+ "group_size": 64,
197
+ "bits": 8
198
+ },
199
+ "model.layers.14.mlp.gate": {
200
+ "group_size": 64,
201
+ "bits": 8
202
+ },
203
+ "model.layers.14.mlp.shared_expert_gate": {
204
+ "group_size": 64,
205
+ "bits": 8
206
+ },
207
+ "model.layers.15.mlp.gate": {
208
+ "group_size": 64,
209
+ "bits": 8
210
+ },
211
+ "model.layers.15.mlp.shared_expert_gate": {
212
+ "group_size": 64,
213
+ "bits": 8
214
+ },
215
+ "model.layers.16.mlp.gate": {
216
+ "group_size": 64,
217
+ "bits": 8
218
+ },
219
+ "model.layers.16.mlp.shared_expert_gate": {
220
+ "group_size": 64,
221
+ "bits": 8
222
+ },
223
+ "model.layers.17.mlp.gate": {
224
+ "group_size": 64,
225
+ "bits": 8
226
+ },
227
+ "model.layers.17.mlp.shared_expert_gate": {
228
+ "group_size": 64,
229
+ "bits": 8
230
+ },
231
+ "model.layers.18.mlp.gate": {
232
+ "group_size": 64,
233
+ "bits": 8
234
+ },
235
+ "model.layers.18.mlp.shared_expert_gate": {
236
+ "group_size": 64,
237
+ "bits": 8
238
+ },
239
+ "model.layers.19.mlp.gate": {
240
+ "group_size": 64,
241
+ "bits": 8
242
+ },
243
+ "model.layers.19.mlp.shared_expert_gate": {
244
+ "group_size": 64,
245
+ "bits": 8
246
+ },
247
+ "model.layers.20.mlp.gate": {
248
+ "group_size": 64,
249
+ "bits": 8
250
+ },
251
+ "model.layers.20.mlp.shared_expert_gate": {
252
+ "group_size": 64,
253
+ "bits": 8
254
+ },
255
+ "model.layers.21.mlp.gate": {
256
+ "group_size": 64,
257
+ "bits": 8
258
+ },
259
+ "model.layers.21.mlp.shared_expert_gate": {
260
+ "group_size": 64,
261
+ "bits": 8
262
+ },
263
+ "model.layers.22.mlp.gate": {
264
+ "group_size": 64,
265
+ "bits": 8
266
+ },
267
+ "model.layers.22.mlp.shared_expert_gate": {
268
+ "group_size": 64,
269
+ "bits": 8
270
+ },
271
+ "model.layers.23.mlp.gate": {
272
+ "group_size": 64,
273
+ "bits": 8
274
+ },
275
+ "model.layers.23.mlp.shared_expert_gate": {
276
+ "group_size": 64,
277
+ "bits": 8
278
+ },
279
+ "model.layers.24.mlp.gate": {
280
+ "group_size": 64,
281
+ "bits": 8
282
+ },
283
+ "model.layers.24.mlp.shared_expert_gate": {
284
+ "group_size": 64,
285
+ "bits": 8
286
+ },
287
+ "model.layers.25.mlp.gate": {
288
+ "group_size": 64,
289
+ "bits": 8
290
+ },
291
+ "model.layers.25.mlp.shared_expert_gate": {
292
+ "group_size": 64,
293
+ "bits": 8
294
+ },
295
+ "model.layers.26.mlp.gate": {
296
+ "group_size": 64,
297
+ "bits": 8
298
+ },
299
+ "model.layers.26.mlp.shared_expert_gate": {
300
+ "group_size": 64,
301
+ "bits": 8
302
+ },
303
+ "model.layers.27.mlp.gate": {
304
+ "group_size": 64,
305
+ "bits": 8
306
+ },
307
+ "model.layers.27.mlp.shared_expert_gate": {
308
+ "group_size": 64,
309
+ "bits": 8
310
+ },
311
+ "model.layers.28.mlp.gate": {
312
+ "group_size": 64,
313
+ "bits": 8
314
+ },
315
+ "model.layers.28.mlp.shared_expert_gate": {
316
+ "group_size": 64,
317
+ "bits": 8
318
+ },
319
+ "model.layers.29.mlp.gate": {
320
+ "group_size": 64,
321
+ "bits": 8
322
+ },
323
+ "model.layers.29.mlp.shared_expert_gate": {
324
+ "group_size": 64,
325
+ "bits": 8
326
+ },
327
+ "model.layers.30.mlp.gate": {
328
+ "group_size": 64,
329
+ "bits": 8
330
+ },
331
+ "model.layers.30.mlp.shared_expert_gate": {
332
+ "group_size": 64,
333
+ "bits": 8
334
+ },
335
+ "model.layers.31.mlp.gate": {
336
+ "group_size": 64,
337
+ "bits": 8
338
+ },
339
+ "model.layers.31.mlp.shared_expert_gate": {
340
+ "group_size": 64,
341
+ "bits": 8
342
+ },
343
+ "model.layers.32.mlp.gate": {
344
+ "group_size": 64,
345
+ "bits": 8
346
+ },
347
+ "model.layers.32.mlp.shared_expert_gate": {
348
+ "group_size": 64,
349
+ "bits": 8
350
+ },
351
+ "model.layers.33.mlp.gate": {
352
+ "group_size": 64,
353
+ "bits": 8
354
+ },
355
+ "model.layers.33.mlp.shared_expert_gate": {
356
+ "group_size": 64,
357
+ "bits": 8
358
+ },
359
+ "model.layers.34.mlp.gate": {
360
+ "group_size": 64,
361
+ "bits": 8
362
+ },
363
+ "model.layers.34.mlp.shared_expert_gate": {
364
+ "group_size": 64,
365
+ "bits": 8
366
+ },
367
+ "model.layers.35.mlp.gate": {
368
+ "group_size": 64,
369
+ "bits": 8
370
+ },
371
+ "model.layers.35.mlp.shared_expert_gate": {
372
+ "group_size": 64,
373
+ "bits": 8
374
+ },
375
+ "model.layers.36.mlp.gate": {
376
+ "group_size": 64,
377
+ "bits": 8
378
+ },
379
+ "model.layers.36.mlp.shared_expert_gate": {
380
+ "group_size": 64,
381
+ "bits": 8
382
+ },
383
+ "model.layers.37.mlp.gate": {
384
+ "group_size": 64,
385
+ "bits": 8
386
+ },
387
+ "model.layers.37.mlp.shared_expert_gate": {
388
+ "group_size": 64,
389
+ "bits": 8
390
+ },
391
+ "model.layers.38.mlp.gate": {
392
+ "group_size": 64,
393
+ "bits": 8
394
+ },
395
+ "model.layers.38.mlp.shared_expert_gate": {
396
+ "group_size": 64,
397
+ "bits": 8
398
+ },
399
+ "model.layers.39.mlp.gate": {
400
+ "group_size": 64,
401
+ "bits": 8
402
+ },
403
+ "model.layers.39.mlp.shared_expert_gate": {
404
+ "group_size": 64,
405
+ "bits": 8
406
+ },
407
+ "model.layers.40.mlp.gate": {
408
+ "group_size": 64,
409
+ "bits": 8
410
+ },
411
+ "model.layers.40.mlp.shared_expert_gate": {
412
+ "group_size": 64,
413
+ "bits": 8
414
+ },
415
+ "model.layers.41.mlp.gate": {
416
+ "group_size": 64,
417
+ "bits": 8
418
+ },
419
+ "model.layers.41.mlp.shared_expert_gate": {
420
+ "group_size": 64,
421
+ "bits": 8
422
+ },
423
+ "model.layers.42.mlp.gate": {
424
+ "group_size": 64,
425
+ "bits": 8
426
+ },
427
+ "model.layers.42.mlp.shared_expert_gate": {
428
+ "group_size": 64,
429
+ "bits": 8
430
+ },
431
+ "model.layers.43.mlp.gate": {
432
+ "group_size": 64,
433
+ "bits": 8
434
+ },
435
+ "model.layers.43.mlp.shared_expert_gate": {
436
+ "group_size": 64,
437
+ "bits": 8
438
+ },
439
+ "model.layers.44.mlp.gate": {
440
+ "group_size": 64,
441
+ "bits": 8
442
+ },
443
+ "model.layers.44.mlp.shared_expert_gate": {
444
+ "group_size": 64,
445
+ "bits": 8
446
+ },
447
+ "model.layers.45.mlp.gate": {
448
+ "group_size": 64,
449
+ "bits": 8
450
+ },
451
+ "model.layers.45.mlp.shared_expert_gate": {
452
+ "group_size": 64,
453
+ "bits": 8
454
+ },
455
+ "model.layers.46.mlp.gate": {
456
+ "group_size": 64,
457
+ "bits": 8
458
+ },
459
+ "model.layers.46.mlp.shared_expert_gate": {
460
+ "group_size": 64,
461
+ "bits": 8
462
+ },
463
+ "model.layers.47.mlp.gate": {
464
+ "group_size": 64,
465
+ "bits": 8
466
+ },
467
+ "model.layers.47.mlp.shared_expert_gate": {
468
+ "group_size": 64,
469
+ "bits": 8
470
+ }
471
+ },
472
+ "quantization_config": {
473
+ "group_size": 64,
474
+ "bits": 8,
475
+ "mode": "affine",
476
+ "model.layers.0.mlp.gate": {
477
+ "group_size": 64,
478
+ "bits": 8
479
+ },
480
+ "model.layers.0.mlp.shared_expert_gate": {
481
+ "group_size": 64,
482
+ "bits": 8
483
+ },
484
+ "model.layers.1.mlp.gate": {
485
+ "group_size": 64,
486
+ "bits": 8
487
+ },
488
+ "model.layers.1.mlp.shared_expert_gate": {
489
+ "group_size": 64,
490
+ "bits": 8
491
+ },
492
+ "model.layers.2.mlp.gate": {
493
+ "group_size": 64,
494
+ "bits": 8
495
+ },
496
+ "model.layers.2.mlp.shared_expert_gate": {
497
+ "group_size": 64,
498
+ "bits": 8
499
+ },
500
+ "model.layers.3.mlp.gate": {
501
+ "group_size": 64,
502
+ "bits": 8
503
+ },
504
+ "model.layers.3.mlp.shared_expert_gate": {
505
+ "group_size": 64,
506
+ "bits": 8
507
+ },
508
+ "model.layers.4.mlp.gate": {
509
+ "group_size": 64,
510
+ "bits": 8
511
+ },
512
+ "model.layers.4.mlp.shared_expert_gate": {
513
+ "group_size": 64,
514
+ "bits": 8
515
+ },
516
+ "model.layers.5.mlp.gate": {
517
+ "group_size": 64,
518
+ "bits": 8
519
+ },
520
+ "model.layers.5.mlp.shared_expert_gate": {
521
+ "group_size": 64,
522
+ "bits": 8
523
+ },
524
+ "model.layers.6.mlp.gate": {
525
+ "group_size": 64,
526
+ "bits": 8
527
+ },
528
+ "model.layers.6.mlp.shared_expert_gate": {
529
+ "group_size": 64,
530
+ "bits": 8
531
+ },
532
+ "model.layers.7.mlp.gate": {
533
+ "group_size": 64,
534
+ "bits": 8
535
+ },
536
+ "model.layers.7.mlp.shared_expert_gate": {
537
+ "group_size": 64,
538
+ "bits": 8
539
+ },
540
+ "model.layers.8.mlp.gate": {
541
+ "group_size": 64,
542
+ "bits": 8
543
+ },
544
+ "model.layers.8.mlp.shared_expert_gate": {
545
+ "group_size": 64,
546
+ "bits": 8
547
+ },
548
+ "model.layers.9.mlp.gate": {
549
+ "group_size": 64,
550
+ "bits": 8
551
+ },
552
+ "model.layers.9.mlp.shared_expert_gate": {
553
+ "group_size": 64,
554
+ "bits": 8
555
+ },
556
+ "model.layers.10.mlp.gate": {
557
+ "group_size": 64,
558
+ "bits": 8
559
+ },
560
+ "model.layers.10.mlp.shared_expert_gate": {
561
+ "group_size": 64,
562
+ "bits": 8
563
+ },
564
+ "model.layers.11.mlp.gate": {
565
+ "group_size": 64,
566
+ "bits": 8
567
+ },
568
+ "model.layers.11.mlp.shared_expert_gate": {
569
+ "group_size": 64,
570
+ "bits": 8
571
+ },
572
+ "model.layers.12.mlp.gate": {
573
+ "group_size": 64,
574
+ "bits": 8
575
+ },
576
+ "model.layers.12.mlp.shared_expert_gate": {
577
+ "group_size": 64,
578
+ "bits": 8
579
+ },
580
+ "model.layers.13.mlp.gate": {
581
+ "group_size": 64,
582
+ "bits": 8
583
+ },
584
+ "model.layers.13.mlp.shared_expert_gate": {
585
+ "group_size": 64,
586
+ "bits": 8
587
+ },
588
+ "model.layers.14.mlp.gate": {
589
+ "group_size": 64,
590
+ "bits": 8
591
+ },
592
+ "model.layers.14.mlp.shared_expert_gate": {
593
+ "group_size": 64,
594
+ "bits": 8
595
+ },
596
+ "model.layers.15.mlp.gate": {
597
+ "group_size": 64,
598
+ "bits": 8
599
+ },
600
+ "model.layers.15.mlp.shared_expert_gate": {
601
+ "group_size": 64,
602
+ "bits": 8
603
+ },
604
+ "model.layers.16.mlp.gate": {
605
+ "group_size": 64,
606
+ "bits": 8
607
+ },
608
+ "model.layers.16.mlp.shared_expert_gate": {
609
+ "group_size": 64,
610
+ "bits": 8
611
+ },
612
+ "model.layers.17.mlp.gate": {
613
+ "group_size": 64,
614
+ "bits": 8
615
+ },
616
+ "model.layers.17.mlp.shared_expert_gate": {
617
+ "group_size": 64,
618
+ "bits": 8
619
+ },
620
+ "model.layers.18.mlp.gate": {
621
+ "group_size": 64,
622
+ "bits": 8
623
+ },
624
+ "model.layers.18.mlp.shared_expert_gate": {
625
+ "group_size": 64,
626
+ "bits": 8
627
+ },
628
+ "model.layers.19.mlp.gate": {
629
+ "group_size": 64,
630
+ "bits": 8
631
+ },
632
+ "model.layers.19.mlp.shared_expert_gate": {
633
+ "group_size": 64,
634
+ "bits": 8
635
+ },
636
+ "model.layers.20.mlp.gate": {
637
+ "group_size": 64,
638
+ "bits": 8
639
+ },
640
+ "model.layers.20.mlp.shared_expert_gate": {
641
+ "group_size": 64,
642
+ "bits": 8
643
+ },
644
+ "model.layers.21.mlp.gate": {
645
+ "group_size": 64,
646
+ "bits": 8
647
+ },
648
+ "model.layers.21.mlp.shared_expert_gate": {
649
+ "group_size": 64,
650
+ "bits": 8
651
+ },
652
+ "model.layers.22.mlp.gate": {
653
+ "group_size": 64,
654
+ "bits": 8
655
+ },
656
+ "model.layers.22.mlp.shared_expert_gate": {
657
+ "group_size": 64,
658
+ "bits": 8
659
+ },
660
+ "model.layers.23.mlp.gate": {
661
+ "group_size": 64,
662
+ "bits": 8
663
+ },
664
+ "model.layers.23.mlp.shared_expert_gate": {
665
+ "group_size": 64,
666
+ "bits": 8
667
+ },
668
+ "model.layers.24.mlp.gate": {
669
+ "group_size": 64,
670
+ "bits": 8
671
+ },
672
+ "model.layers.24.mlp.shared_expert_gate": {
673
+ "group_size": 64,
674
+ "bits": 8
675
+ },
676
+ "model.layers.25.mlp.gate": {
677
+ "group_size": 64,
678
+ "bits": 8
679
+ },
680
+ "model.layers.25.mlp.shared_expert_gate": {
681
+ "group_size": 64,
682
+ "bits": 8
683
+ },
684
+ "model.layers.26.mlp.gate": {
685
+ "group_size": 64,
686
+ "bits": 8
687
+ },
688
+ "model.layers.26.mlp.shared_expert_gate": {
689
+ "group_size": 64,
690
+ "bits": 8
691
+ },
692
+ "model.layers.27.mlp.gate": {
693
+ "group_size": 64,
694
+ "bits": 8
695
+ },
696
+ "model.layers.27.mlp.shared_expert_gate": {
697
+ "group_size": 64,
698
+ "bits": 8
699
+ },
700
+ "model.layers.28.mlp.gate": {
701
+ "group_size": 64,
702
+ "bits": 8
703
+ },
704
+ "model.layers.28.mlp.shared_expert_gate": {
705
+ "group_size": 64,
706
+ "bits": 8
707
+ },
708
+ "model.layers.29.mlp.gate": {
709
+ "group_size": 64,
710
+ "bits": 8
711
+ },
712
+ "model.layers.29.mlp.shared_expert_gate": {
713
+ "group_size": 64,
714
+ "bits": 8
715
+ },
716
+ "model.layers.30.mlp.gate": {
717
+ "group_size": 64,
718
+ "bits": 8
719
+ },
720
+ "model.layers.30.mlp.shared_expert_gate": {
721
+ "group_size": 64,
722
+ "bits": 8
723
+ },
724
+ "model.layers.31.mlp.gate": {
725
+ "group_size": 64,
726
+ "bits": 8
727
+ },
728
+ "model.layers.31.mlp.shared_expert_gate": {
729
+ "group_size": 64,
730
+ "bits": 8
731
+ },
732
+ "model.layers.32.mlp.gate": {
733
+ "group_size": 64,
734
+ "bits": 8
735
+ },
736
+ "model.layers.32.mlp.shared_expert_gate": {
737
+ "group_size": 64,
738
+ "bits": 8
739
+ },
740
+ "model.layers.33.mlp.gate": {
741
+ "group_size": 64,
742
+ "bits": 8
743
+ },
744
+ "model.layers.33.mlp.shared_expert_gate": {
745
+ "group_size": 64,
746
+ "bits": 8
747
+ },
748
+ "model.layers.34.mlp.gate": {
749
+ "group_size": 64,
750
+ "bits": 8
751
+ },
752
+ "model.layers.34.mlp.shared_expert_gate": {
753
+ "group_size": 64,
754
+ "bits": 8
755
+ },
756
+ "model.layers.35.mlp.gate": {
757
+ "group_size": 64,
758
+ "bits": 8
759
+ },
760
+ "model.layers.35.mlp.shared_expert_gate": {
761
+ "group_size": 64,
762
+ "bits": 8
763
+ },
764
+ "model.layers.36.mlp.gate": {
765
+ "group_size": 64,
766
+ "bits": 8
767
+ },
768
+ "model.layers.36.mlp.shared_expert_gate": {
769
+ "group_size": 64,
770
+ "bits": 8
771
+ },
772
+ "model.layers.37.mlp.gate": {
773
+ "group_size": 64,
774
+ "bits": 8
775
+ },
776
+ "model.layers.37.mlp.shared_expert_gate": {
777
+ "group_size": 64,
778
+ "bits": 8
779
+ },
780
+ "model.layers.38.mlp.gate": {
781
+ "group_size": 64,
782
+ "bits": 8
783
+ },
784
+ "model.layers.38.mlp.shared_expert_gate": {
785
+ "group_size": 64,
786
+ "bits": 8
787
+ },
788
+ "model.layers.39.mlp.gate": {
789
+ "group_size": 64,
790
+ "bits": 8
791
+ },
792
+ "model.layers.39.mlp.shared_expert_gate": {
793
+ "group_size": 64,
794
+ "bits": 8
795
+ },
796
+ "model.layers.40.mlp.gate": {
797
+ "group_size": 64,
798
+ "bits": 8
799
+ },
800
+ "model.layers.40.mlp.shared_expert_gate": {
801
+ "group_size": 64,
802
+ "bits": 8
803
+ },
804
+ "model.layers.41.mlp.gate": {
805
+ "group_size": 64,
806
+ "bits": 8
807
+ },
808
+ "model.layers.41.mlp.shared_expert_gate": {
809
+ "group_size": 64,
810
+ "bits": 8
811
+ },
812
+ "model.layers.42.mlp.gate": {
813
+ "group_size": 64,
814
+ "bits": 8
815
+ },
816
+ "model.layers.42.mlp.shared_expert_gate": {
817
+ "group_size": 64,
818
+ "bits": 8
819
+ },
820
+ "model.layers.43.mlp.gate": {
821
+ "group_size": 64,
822
+ "bits": 8
823
+ },
824
+ "model.layers.43.mlp.shared_expert_gate": {
825
+ "group_size": 64,
826
+ "bits": 8
827
+ },
828
+ "model.layers.44.mlp.gate": {
829
+ "group_size": 64,
830
+ "bits": 8
831
+ },
832
+ "model.layers.44.mlp.shared_expert_gate": {
833
+ "group_size": 64,
834
+ "bits": 8
835
+ },
836
+ "model.layers.45.mlp.gate": {
837
+ "group_size": 64,
838
+ "bits": 8
839
+ },
840
+ "model.layers.45.mlp.shared_expert_gate": {
841
+ "group_size": 64,
842
+ "bits": 8
843
+ },
844
+ "model.layers.46.mlp.gate": {
845
+ "group_size": 64,
846
+ "bits": 8
847
+ },
848
+ "model.layers.46.mlp.shared_expert_gate": {
849
+ "group_size": 64,
850
+ "bits": 8
851
+ },
852
+ "model.layers.47.mlp.gate": {
853
+ "group_size": 64,
854
+ "bits": 8
855
+ },
856
+ "model.layers.47.mlp.shared_expert_gate": {
857
+ "group_size": 64,
858
+ "bits": 8
859
+ }
860
+ },
861
+ "rms_norm_eps": 1e-06,
862
+ "rope_scaling": null,
863
+ "rope_theta": 5000000,
864
+ "router_aux_loss_coef": 0.001,
865
+ "shared_expert_intermediate_size": 512,
866
+ "tie_word_embeddings": false,
867
+ "torch_dtype": "bfloat16",
868
+ "transformers_version": "4.57.6",
869
+ "unsloth_fixed": true,
870
+ "use_cache": true,
871
+ "use_sliding_window": false,
872
+ "vocab_size": 151936
873
+ }