lthn committed on
Commit
6e45a60
·
verified ·
1 Parent(s): d320bab

fix(config): sync with upstream Google Gemma 4 day-1 fixes (b446025, b4a6011)

Browse files

- Restore vision_config block; trim eos_token_id from [1, 106, 50] to [1, 106]
- Align generation_config.json transformers_version field

Preserves local tokenizer_config.json (model_specific_special_tokens embedded into model during LEK training).

Files changed (1) hide show
  1. config.json +169 -1620
config.json CHANGED
@@ -1,1627 +1,176 @@
1
  {
2
- "architectures": [
3
- "Gemma4ForConditionalGeneration"
4
- ],
5
- "audio_config": null,
6
- "audio_token_id": 258881,
7
- "boa_token_id": 256000,
8
- "boi_token_id": 255999,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "dtype": "bfloat16",
10
- "eoa_token_id": 258883,
11
- "eoa_token_index": 258883,
12
- "eoi_token_id": 258882,
13
- "eos_token_id": [
14
- 1,
15
- 106,
16
- 50
17
- ],
18
- "image_token_id": 258880,
19
  "initializer_range": 0.02,
20
- "model_type": "gemma4",
21
- "quantization": {
22
- "group_size": 64,
23
- "bits": 4,
24
- "mode": "affine",
25
- "language_model.model.layers.0.mlp.gate_proj": {
26
- "group_size": 64,
27
- "bits": 8
28
- },
29
- "language_model.model.layers.0.mlp.down_proj": {
30
- "group_size": 64,
31
- "bits": 8
32
- },
33
- "language_model.model.layers.0.mlp.up_proj": {
34
- "group_size": 64,
35
- "bits": 8
36
- },
37
- "language_model.model.layers.1.mlp.gate_proj": {
38
- "group_size": 64,
39
- "bits": 8
40
- },
41
- "language_model.model.layers.1.mlp.down_proj": {
42
- "group_size": 64,
43
- "bits": 8
44
- },
45
- "language_model.model.layers.1.mlp.up_proj": {
46
- "group_size": 64,
47
- "bits": 8
48
- },
49
- "language_model.model.layers.2.mlp.gate_proj": {
50
- "group_size": 64,
51
- "bits": 8
52
- },
53
- "language_model.model.layers.2.mlp.down_proj": {
54
- "group_size": 64,
55
- "bits": 8
56
- },
57
- "language_model.model.layers.2.mlp.up_proj": {
58
- "group_size": 64,
59
- "bits": 8
60
- },
61
- "language_model.model.layers.3.mlp.gate_proj": {
62
- "group_size": 64,
63
- "bits": 8
64
- },
65
- "language_model.model.layers.3.mlp.down_proj": {
66
- "group_size": 64,
67
- "bits": 8
68
- },
69
- "language_model.model.layers.3.mlp.up_proj": {
70
- "group_size": 64,
71
- "bits": 8
72
- },
73
- "language_model.model.layers.4.mlp.gate_proj": {
74
- "group_size": 64,
75
- "bits": 8
76
- },
77
- "language_model.model.layers.4.mlp.down_proj": {
78
- "group_size": 64,
79
- "bits": 8
80
- },
81
- "language_model.model.layers.4.mlp.up_proj": {
82
- "group_size": 64,
83
- "bits": 8
84
- },
85
- "language_model.model.layers.5.mlp.gate_proj": {
86
- "group_size": 64,
87
- "bits": 8
88
- },
89
- "language_model.model.layers.5.mlp.down_proj": {
90
- "group_size": 64,
91
- "bits": 8
92
- },
93
- "language_model.model.layers.5.mlp.up_proj": {
94
- "group_size": 64,
95
- "bits": 8
96
- },
97
- "language_model.model.layers.6.mlp.gate_proj": {
98
- "group_size": 64,
99
- "bits": 8
100
- },
101
- "language_model.model.layers.6.mlp.down_proj": {
102
- "group_size": 64,
103
- "bits": 8
104
- },
105
- "language_model.model.layers.6.mlp.up_proj": {
106
- "group_size": 64,
107
- "bits": 8
108
- },
109
- "language_model.model.layers.7.mlp.gate_proj": {
110
- "group_size": 64,
111
- "bits": 8
112
- },
113
- "language_model.model.layers.7.mlp.down_proj": {
114
- "group_size": 64,
115
- "bits": 8
116
- },
117
- "language_model.model.layers.7.mlp.up_proj": {
118
- "group_size": 64,
119
- "bits": 8
120
- },
121
- "language_model.model.layers.8.mlp.gate_proj": {
122
- "group_size": 64,
123
- "bits": 8
124
- },
125
- "language_model.model.layers.8.mlp.down_proj": {
126
- "group_size": 64,
127
- "bits": 8
128
- },
129
- "language_model.model.layers.8.mlp.up_proj": {
130
- "group_size": 64,
131
- "bits": 8
132
- },
133
- "language_model.model.layers.9.mlp.gate_proj": {
134
- "group_size": 64,
135
- "bits": 8
136
- },
137
- "language_model.model.layers.9.mlp.down_proj": {
138
- "group_size": 64,
139
- "bits": 8
140
- },
141
- "language_model.model.layers.9.mlp.up_proj": {
142
- "group_size": 64,
143
- "bits": 8
144
- },
145
- "language_model.model.layers.10.mlp.gate_proj": {
146
- "group_size": 64,
147
- "bits": 8
148
- },
149
- "language_model.model.layers.10.mlp.down_proj": {
150
- "group_size": 64,
151
- "bits": 8
152
- },
153
- "language_model.model.layers.10.mlp.up_proj": {
154
- "group_size": 64,
155
- "bits": 8
156
- },
157
- "language_model.model.layers.11.mlp.gate_proj": {
158
- "group_size": 64,
159
- "bits": 8
160
- },
161
- "language_model.model.layers.11.mlp.down_proj": {
162
- "group_size": 64,
163
- "bits": 8
164
- },
165
- "language_model.model.layers.11.mlp.up_proj": {
166
- "group_size": 64,
167
- "bits": 8
168
- },
169
- "language_model.model.layers.12.mlp.gate_proj": {
170
- "group_size": 64,
171
- "bits": 8
172
- },
173
- "language_model.model.layers.12.mlp.down_proj": {
174
- "group_size": 64,
175
- "bits": 8
176
- },
177
- "language_model.model.layers.12.mlp.up_proj": {
178
- "group_size": 64,
179
- "bits": 8
180
- },
181
- "language_model.model.layers.13.mlp.gate_proj": {
182
- "group_size": 64,
183
- "bits": 8
184
- },
185
- "language_model.model.layers.13.mlp.down_proj": {
186
- "group_size": 64,
187
- "bits": 8
188
- },
189
- "language_model.model.layers.13.mlp.up_proj": {
190
- "group_size": 64,
191
- "bits": 8
192
- },
193
- "language_model.model.layers.14.mlp.gate_proj": {
194
- "group_size": 64,
195
- "bits": 8
196
- },
197
- "language_model.model.layers.14.mlp.down_proj": {
198
- "group_size": 64,
199
- "bits": 8
200
- },
201
- "language_model.model.layers.14.mlp.up_proj": {
202
- "group_size": 64,
203
- "bits": 8
204
- },
205
- "language_model.model.layers.15.mlp.gate_proj": {
206
- "group_size": 64,
207
- "bits": 8
208
- },
209
- "language_model.model.layers.15.mlp.down_proj": {
210
- "group_size": 64,
211
- "bits": 8
212
- },
213
- "language_model.model.layers.15.mlp.up_proj": {
214
- "group_size": 64,
215
- "bits": 8
216
- },
217
- "language_model.model.layers.16.mlp.gate_proj": {
218
- "group_size": 64,
219
- "bits": 8
220
- },
221
- "language_model.model.layers.16.mlp.down_proj": {
222
- "group_size": 64,
223
- "bits": 8
224
- },
225
- "language_model.model.layers.16.mlp.up_proj": {
226
- "group_size": 64,
227
- "bits": 8
228
- },
229
- "language_model.model.layers.17.mlp.gate_proj": {
230
- "group_size": 64,
231
- "bits": 8
232
- },
233
- "language_model.model.layers.17.mlp.down_proj": {
234
- "group_size": 64,
235
- "bits": 8
236
- },
237
- "language_model.model.layers.17.mlp.up_proj": {
238
- "group_size": 64,
239
- "bits": 8
240
- },
241
- "language_model.model.layers.18.mlp.gate_proj": {
242
- "group_size": 64,
243
- "bits": 8
244
- },
245
- "language_model.model.layers.18.mlp.down_proj": {
246
- "group_size": 64,
247
- "bits": 8
248
- },
249
- "language_model.model.layers.18.mlp.up_proj": {
250
- "group_size": 64,
251
- "bits": 8
252
- },
253
- "language_model.model.layers.19.mlp.gate_proj": {
254
- "group_size": 64,
255
- "bits": 8
256
- },
257
- "language_model.model.layers.19.mlp.down_proj": {
258
- "group_size": 64,
259
- "bits": 8
260
- },
261
- "language_model.model.layers.19.mlp.up_proj": {
262
- "group_size": 64,
263
- "bits": 8
264
- },
265
- "language_model.model.layers.20.mlp.gate_proj": {
266
- "group_size": 64,
267
- "bits": 8
268
- },
269
- "language_model.model.layers.20.mlp.down_proj": {
270
- "group_size": 64,
271
- "bits": 8
272
- },
273
- "language_model.model.layers.20.mlp.up_proj": {
274
- "group_size": 64,
275
- "bits": 8
276
- },
277
- "language_model.model.layers.21.mlp.gate_proj": {
278
- "group_size": 64,
279
- "bits": 8
280
- },
281
- "language_model.model.layers.21.mlp.down_proj": {
282
- "group_size": 64,
283
- "bits": 8
284
- },
285
- "language_model.model.layers.21.mlp.up_proj": {
286
- "group_size": 64,
287
- "bits": 8
288
- },
289
- "language_model.model.layers.22.mlp.gate_proj": {
290
- "group_size": 64,
291
- "bits": 8
292
- },
293
- "language_model.model.layers.22.mlp.down_proj": {
294
- "group_size": 64,
295
- "bits": 8
296
- },
297
- "language_model.model.layers.22.mlp.up_proj": {
298
- "group_size": 64,
299
- "bits": 8
300
- },
301
- "language_model.model.layers.23.mlp.gate_proj": {
302
- "group_size": 64,
303
- "bits": 8
304
- },
305
- "language_model.model.layers.23.mlp.down_proj": {
306
- "group_size": 64,
307
- "bits": 8
308
- },
309
- "language_model.model.layers.23.mlp.up_proj": {
310
- "group_size": 64,
311
- "bits": 8
312
- },
313
- "language_model.model.layers.24.mlp.gate_proj": {
314
- "group_size": 64,
315
- "bits": 8
316
- },
317
- "language_model.model.layers.24.mlp.down_proj": {
318
- "group_size": 64,
319
- "bits": 8
320
- },
321
- "language_model.model.layers.24.mlp.up_proj": {
322
- "group_size": 64,
323
- "bits": 8
324
- },
325
- "language_model.model.layers.25.mlp.gate_proj": {
326
- "group_size": 64,
327
- "bits": 8
328
- },
329
- "language_model.model.layers.25.mlp.down_proj": {
330
- "group_size": 64,
331
- "bits": 8
332
- },
333
- "language_model.model.layers.25.mlp.up_proj": {
334
- "group_size": 64,
335
- "bits": 8
336
- },
337
- "language_model.model.layers.26.mlp.gate_proj": {
338
- "group_size": 64,
339
- "bits": 8
340
- },
341
- "language_model.model.layers.26.mlp.down_proj": {
342
- "group_size": 64,
343
- "bits": 8
344
- },
345
- "language_model.model.layers.26.mlp.up_proj": {
346
- "group_size": 64,
347
- "bits": 8
348
- },
349
- "language_model.model.layers.27.mlp.gate_proj": {
350
- "group_size": 64,
351
- "bits": 8
352
- },
353
- "language_model.model.layers.27.mlp.down_proj": {
354
- "group_size": 64,
355
- "bits": 8
356
- },
357
- "language_model.model.layers.27.mlp.up_proj": {
358
- "group_size": 64,
359
- "bits": 8
360
- },
361
- "language_model.model.layers.28.mlp.gate_proj": {
362
- "group_size": 64,
363
- "bits": 8
364
- },
365
- "language_model.model.layers.28.mlp.down_proj": {
366
- "group_size": 64,
367
- "bits": 8
368
- },
369
- "language_model.model.layers.28.mlp.up_proj": {
370
- "group_size": 64,
371
- "bits": 8
372
- },
373
- "language_model.model.layers.29.mlp.gate_proj": {
374
- "group_size": 64,
375
- "bits": 8
376
- },
377
- "language_model.model.layers.29.mlp.down_proj": {
378
- "group_size": 64,
379
- "bits": 8
380
- },
381
- "language_model.model.layers.29.mlp.up_proj": {
382
- "group_size": 64,
383
- "bits": 8
384
- },
385
- "language_model.model.layers.30.mlp.gate_proj": {
386
- "group_size": 64,
387
- "bits": 8
388
- },
389
- "language_model.model.layers.30.mlp.down_proj": {
390
- "group_size": 64,
391
- "bits": 8
392
- },
393
- "language_model.model.layers.30.mlp.up_proj": {
394
- "group_size": 64,
395
- "bits": 8
396
- },
397
- "language_model.model.layers.31.mlp.gate_proj": {
398
- "group_size": 64,
399
- "bits": 8
400
- },
401
- "language_model.model.layers.31.mlp.down_proj": {
402
- "group_size": 64,
403
- "bits": 8
404
- },
405
- "language_model.model.layers.31.mlp.up_proj": {
406
- "group_size": 64,
407
- "bits": 8
408
- },
409
- "language_model.model.layers.32.mlp.gate_proj": {
410
- "group_size": 64,
411
- "bits": 8
412
- },
413
- "language_model.model.layers.32.mlp.down_proj": {
414
- "group_size": 64,
415
- "bits": 8
416
- },
417
- "language_model.model.layers.32.mlp.up_proj": {
418
- "group_size": 64,
419
- "bits": 8
420
- },
421
- "language_model.model.layers.33.mlp.gate_proj": {
422
- "group_size": 64,
423
- "bits": 8
424
- },
425
- "language_model.model.layers.33.mlp.down_proj": {
426
- "group_size": 64,
427
- "bits": 8
428
- },
429
- "language_model.model.layers.33.mlp.up_proj": {
430
- "group_size": 64,
431
- "bits": 8
432
- },
433
- "language_model.model.layers.34.mlp.gate_proj": {
434
- "group_size": 64,
435
- "bits": 8
436
- },
437
- "language_model.model.layers.34.mlp.down_proj": {
438
- "group_size": 64,
439
- "bits": 8
440
- },
441
- "language_model.model.layers.34.mlp.up_proj": {
442
- "group_size": 64,
443
- "bits": 8
444
- },
445
- "language_model.model.layers.35.mlp.gate_proj": {
446
- "group_size": 64,
447
- "bits": 8
448
- },
449
- "language_model.model.layers.35.mlp.down_proj": {
450
- "group_size": 64,
451
- "bits": 8
452
- },
453
- "language_model.model.layers.35.mlp.up_proj": {
454
- "group_size": 64,
455
- "bits": 8
456
- },
457
- "language_model.model.layers.36.mlp.gate_proj": {
458
- "group_size": 64,
459
- "bits": 8
460
- },
461
- "language_model.model.layers.36.mlp.down_proj": {
462
- "group_size": 64,
463
- "bits": 8
464
- },
465
- "language_model.model.layers.36.mlp.up_proj": {
466
- "group_size": 64,
467
- "bits": 8
468
- },
469
- "language_model.model.layers.37.mlp.gate_proj": {
470
- "group_size": 64,
471
- "bits": 8
472
- },
473
- "language_model.model.layers.37.mlp.down_proj": {
474
- "group_size": 64,
475
- "bits": 8
476
- },
477
- "language_model.model.layers.37.mlp.up_proj": {
478
- "group_size": 64,
479
- "bits": 8
480
- },
481
- "language_model.model.layers.38.mlp.gate_proj": {
482
- "group_size": 64,
483
- "bits": 8
484
- },
485
- "language_model.model.layers.38.mlp.down_proj": {
486
- "group_size": 64,
487
- "bits": 8
488
- },
489
- "language_model.model.layers.38.mlp.up_proj": {
490
- "group_size": 64,
491
- "bits": 8
492
- },
493
- "language_model.model.layers.39.mlp.gate_proj": {
494
- "group_size": 64,
495
- "bits": 8
496
- },
497
- "language_model.model.layers.39.mlp.down_proj": {
498
- "group_size": 64,
499
- "bits": 8
500
- },
501
- "language_model.model.layers.39.mlp.up_proj": {
502
- "group_size": 64,
503
- "bits": 8
504
- },
505
- "language_model.model.layers.40.mlp.gate_proj": {
506
- "group_size": 64,
507
- "bits": 8
508
- },
509
- "language_model.model.layers.40.mlp.down_proj": {
510
- "group_size": 64,
511
- "bits": 8
512
- },
513
- "language_model.model.layers.40.mlp.up_proj": {
514
- "group_size": 64,
515
- "bits": 8
516
- },
517
- "language_model.model.layers.41.mlp.gate_proj": {
518
- "group_size": 64,
519
- "bits": 8
520
- },
521
- "language_model.model.layers.41.mlp.down_proj": {
522
- "group_size": 64,
523
- "bits": 8
524
- },
525
- "language_model.model.layers.41.mlp.up_proj": {
526
- "group_size": 64,
527
- "bits": 8
528
- },
529
- "language_model.model.layers.42.mlp.gate_proj": {
530
- "group_size": 64,
531
- "bits": 8
532
- },
533
- "language_model.model.layers.42.mlp.down_proj": {
534
- "group_size": 64,
535
- "bits": 8
536
- },
537
- "language_model.model.layers.42.mlp.up_proj": {
538
- "group_size": 64,
539
- "bits": 8
540
- },
541
- "language_model.model.layers.43.mlp.gate_proj": {
542
- "group_size": 64,
543
- "bits": 8
544
- },
545
- "language_model.model.layers.43.mlp.down_proj": {
546
- "group_size": 64,
547
- "bits": 8
548
- },
549
- "language_model.model.layers.43.mlp.up_proj": {
550
- "group_size": 64,
551
- "bits": 8
552
- },
553
- "language_model.model.layers.44.mlp.gate_proj": {
554
- "group_size": 64,
555
- "bits": 8
556
- },
557
- "language_model.model.layers.44.mlp.down_proj": {
558
- "group_size": 64,
559
- "bits": 8
560
- },
561
- "language_model.model.layers.44.mlp.up_proj": {
562
- "group_size": 64,
563
- "bits": 8
564
- },
565
- "language_model.model.layers.45.mlp.gate_proj": {
566
- "group_size": 64,
567
- "bits": 8
568
- },
569
- "language_model.model.layers.45.mlp.down_proj": {
570
- "group_size": 64,
571
- "bits": 8
572
- },
573
- "language_model.model.layers.45.mlp.up_proj": {
574
- "group_size": 64,
575
- "bits": 8
576
- },
577
- "language_model.model.layers.46.mlp.gate_proj": {
578
- "group_size": 64,
579
- "bits": 8
580
- },
581
- "language_model.model.layers.46.mlp.down_proj": {
582
- "group_size": 64,
583
- "bits": 8
584
- },
585
- "language_model.model.layers.46.mlp.up_proj": {
586
- "group_size": 64,
587
- "bits": 8
588
- },
589
- "language_model.model.layers.47.mlp.gate_proj": {
590
- "group_size": 64,
591
- "bits": 8
592
- },
593
- "language_model.model.layers.47.mlp.down_proj": {
594
- "group_size": 64,
595
- "bits": 8
596
- },
597
- "language_model.model.layers.47.mlp.up_proj": {
598
- "group_size": 64,
599
- "bits": 8
600
- },
601
- "language_model.model.layers.48.mlp.gate_proj": {
602
- "group_size": 64,
603
- "bits": 8
604
- },
605
- "language_model.model.layers.48.mlp.down_proj": {
606
- "group_size": 64,
607
- "bits": 8
608
- },
609
- "language_model.model.layers.48.mlp.up_proj": {
610
- "group_size": 64,
611
- "bits": 8
612
- },
613
- "language_model.model.layers.49.mlp.gate_proj": {
614
- "group_size": 64,
615
- "bits": 8
616
- },
617
- "language_model.model.layers.49.mlp.down_proj": {
618
- "group_size": 64,
619
- "bits": 8
620
- },
621
- "language_model.model.layers.49.mlp.up_proj": {
622
- "group_size": 64,
623
- "bits": 8
624
- },
625
- "language_model.model.layers.50.mlp.gate_proj": {
626
- "group_size": 64,
627
- "bits": 8
628
- },
629
- "language_model.model.layers.50.mlp.down_proj": {
630
- "group_size": 64,
631
- "bits": 8
632
- },
633
- "language_model.model.layers.50.mlp.up_proj": {
634
- "group_size": 64,
635
- "bits": 8
636
- },
637
- "language_model.model.layers.51.mlp.gate_proj": {
638
- "group_size": 64,
639
- "bits": 8
640
- },
641
- "language_model.model.layers.51.mlp.down_proj": {
642
- "group_size": 64,
643
- "bits": 8
644
- },
645
- "language_model.model.layers.51.mlp.up_proj": {
646
- "group_size": 64,
647
- "bits": 8
648
- },
649
- "language_model.model.layers.52.mlp.gate_proj": {
650
- "group_size": 64,
651
- "bits": 8
652
- },
653
- "language_model.model.layers.52.mlp.down_proj": {
654
- "group_size": 64,
655
- "bits": 8
656
- },
657
- "language_model.model.layers.52.mlp.up_proj": {
658
- "group_size": 64,
659
- "bits": 8
660
- },
661
- "language_model.model.layers.53.mlp.gate_proj": {
662
- "group_size": 64,
663
- "bits": 8
664
- },
665
- "language_model.model.layers.53.mlp.down_proj": {
666
- "group_size": 64,
667
- "bits": 8
668
- },
669
- "language_model.model.layers.53.mlp.up_proj": {
670
- "group_size": 64,
671
- "bits": 8
672
- },
673
- "language_model.model.layers.54.mlp.gate_proj": {
674
- "group_size": 64,
675
- "bits": 8
676
- },
677
- "language_model.model.layers.54.mlp.down_proj": {
678
- "group_size": 64,
679
- "bits": 8
680
- },
681
- "language_model.model.layers.54.mlp.up_proj": {
682
- "group_size": 64,
683
- "bits": 8
684
- },
685
- "language_model.model.layers.55.mlp.gate_proj": {
686
- "group_size": 64,
687
- "bits": 8
688
- },
689
- "language_model.model.layers.55.mlp.down_proj": {
690
- "group_size": 64,
691
- "bits": 8
692
- },
693
- "language_model.model.layers.55.mlp.up_proj": {
694
- "group_size": 64,
695
- "bits": 8
696
- },
697
- "language_model.model.layers.56.mlp.gate_proj": {
698
- "group_size": 64,
699
- "bits": 8
700
- },
701
- "language_model.model.layers.56.mlp.down_proj": {
702
- "group_size": 64,
703
- "bits": 8
704
- },
705
- "language_model.model.layers.56.mlp.up_proj": {
706
- "group_size": 64,
707
- "bits": 8
708
- },
709
- "language_model.model.layers.57.mlp.gate_proj": {
710
- "group_size": 64,
711
- "bits": 8
712
- },
713
- "language_model.model.layers.57.mlp.down_proj": {
714
- "group_size": 64,
715
- "bits": 8
716
- },
717
- "language_model.model.layers.57.mlp.up_proj": {
718
- "group_size": 64,
719
- "bits": 8
720
- },
721
- "language_model.model.layers.58.mlp.gate_proj": {
722
- "group_size": 64,
723
- "bits": 8
724
- },
725
- "language_model.model.layers.58.mlp.down_proj": {
726
- "group_size": 64,
727
- "bits": 8
728
- },
729
- "language_model.model.layers.58.mlp.up_proj": {
730
- "group_size": 64,
731
- "bits": 8
732
- },
733
- "language_model.model.layers.59.mlp.gate_proj": {
734
- "group_size": 64,
735
- "bits": 8
736
- },
737
- "language_model.model.layers.59.mlp.down_proj": {
738
- "group_size": 64,
739
- "bits": 8
740
- },
741
- "language_model.model.layers.59.mlp.up_proj": {
742
- "group_size": 64,
743
- "bits": 8
744
- }
745
  },
746
- "quantization_config": {
747
- "group_size": 64,
748
- "bits": 4,
749
- "mode": "affine",
750
- "language_model.model.layers.0.mlp.gate_proj": {
751
- "group_size": 64,
752
- "bits": 8
753
- },
754
- "language_model.model.layers.0.mlp.down_proj": {
755
- "group_size": 64,
756
- "bits": 8
757
- },
758
- "language_model.model.layers.0.mlp.up_proj": {
759
- "group_size": 64,
760
- "bits": 8
761
- },
762
- "language_model.model.layers.1.mlp.gate_proj": {
763
- "group_size": 64,
764
- "bits": 8
765
- },
766
- "language_model.model.layers.1.mlp.down_proj": {
767
- "group_size": 64,
768
- "bits": 8
769
- },
770
- "language_model.model.layers.1.mlp.up_proj": {
771
- "group_size": 64,
772
- "bits": 8
773
- },
774
- "language_model.model.layers.2.mlp.gate_proj": {
775
- "group_size": 64,
776
- "bits": 8
777
- },
778
- "language_model.model.layers.2.mlp.down_proj": {
779
- "group_size": 64,
780
- "bits": 8
781
- },
782
- "language_model.model.layers.2.mlp.up_proj": {
783
- "group_size": 64,
784
- "bits": 8
785
- },
786
- "language_model.model.layers.3.mlp.gate_proj": {
787
- "group_size": 64,
788
- "bits": 8
789
- },
790
- "language_model.model.layers.3.mlp.down_proj": {
791
- "group_size": 64,
792
- "bits": 8
793
- },
794
- "language_model.model.layers.3.mlp.up_proj": {
795
- "group_size": 64,
796
- "bits": 8
797
- },
798
- "language_model.model.layers.4.mlp.gate_proj": {
799
- "group_size": 64,
800
- "bits": 8
801
- },
802
- "language_model.model.layers.4.mlp.down_proj": {
803
- "group_size": 64,
804
- "bits": 8
805
- },
806
- "language_model.model.layers.4.mlp.up_proj": {
807
- "group_size": 64,
808
- "bits": 8
809
- },
810
- "language_model.model.layers.5.mlp.gate_proj": {
811
- "group_size": 64,
812
- "bits": 8
813
- },
814
- "language_model.model.layers.5.mlp.down_proj": {
815
- "group_size": 64,
816
- "bits": 8
817
- },
818
- "language_model.model.layers.5.mlp.up_proj": {
819
- "group_size": 64,
820
- "bits": 8
821
- },
822
- "language_model.model.layers.6.mlp.gate_proj": {
823
- "group_size": 64,
824
- "bits": 8
825
- },
826
- "language_model.model.layers.6.mlp.down_proj": {
827
- "group_size": 64,
828
- "bits": 8
829
- },
830
- "language_model.model.layers.6.mlp.up_proj": {
831
- "group_size": 64,
832
- "bits": 8
833
- },
834
- "language_model.model.layers.7.mlp.gate_proj": {
835
- "group_size": 64,
836
- "bits": 8
837
- },
838
- "language_model.model.layers.7.mlp.down_proj": {
839
- "group_size": 64,
840
- "bits": 8
841
- },
842
- "language_model.model.layers.7.mlp.up_proj": {
843
- "group_size": 64,
844
- "bits": 8
845
- },
846
- "language_model.model.layers.8.mlp.gate_proj": {
847
- "group_size": 64,
848
- "bits": 8
849
- },
850
- "language_model.model.layers.8.mlp.down_proj": {
851
- "group_size": 64,
852
- "bits": 8
853
- },
854
- "language_model.model.layers.8.mlp.up_proj": {
855
- "group_size": 64,
856
- "bits": 8
857
- },
858
- "language_model.model.layers.9.mlp.gate_proj": {
859
- "group_size": 64,
860
- "bits": 8
861
- },
862
- "language_model.model.layers.9.mlp.down_proj": {
863
- "group_size": 64,
864
- "bits": 8
865
- },
866
- "language_model.model.layers.9.mlp.up_proj": {
867
- "group_size": 64,
868
- "bits": 8
869
- },
870
- "language_model.model.layers.10.mlp.gate_proj": {
871
- "group_size": 64,
872
- "bits": 8
873
- },
874
- "language_model.model.layers.10.mlp.down_proj": {
875
- "group_size": 64,
876
- "bits": 8
877
- },
878
- "language_model.model.layers.10.mlp.up_proj": {
879
- "group_size": 64,
880
- "bits": 8
881
- },
882
- "language_model.model.layers.11.mlp.gate_proj": {
883
- "group_size": 64,
884
- "bits": 8
885
- },
886
- "language_model.model.layers.11.mlp.down_proj": {
887
- "group_size": 64,
888
- "bits": 8
889
- },
890
- "language_model.model.layers.11.mlp.up_proj": {
891
- "group_size": 64,
892
- "bits": 8
893
- },
894
- "language_model.model.layers.12.mlp.gate_proj": {
895
- "group_size": 64,
896
- "bits": 8
897
- },
898
- "language_model.model.layers.12.mlp.down_proj": {
899
- "group_size": 64,
900
- "bits": 8
901
- },
902
- "language_model.model.layers.12.mlp.up_proj": {
903
- "group_size": 64,
904
- "bits": 8
905
- },
906
- "language_model.model.layers.13.mlp.gate_proj": {
907
- "group_size": 64,
908
- "bits": 8
909
- },
910
- "language_model.model.layers.13.mlp.down_proj": {
911
- "group_size": 64,
912
- "bits": 8
913
- },
914
- "language_model.model.layers.13.mlp.up_proj": {
915
- "group_size": 64,
916
- "bits": 8
917
- },
918
- "language_model.model.layers.14.mlp.gate_proj": {
919
- "group_size": 64,
920
- "bits": 8
921
- },
922
- "language_model.model.layers.14.mlp.down_proj": {
923
- "group_size": 64,
924
- "bits": 8
925
- },
926
- "language_model.model.layers.14.mlp.up_proj": {
927
- "group_size": 64,
928
- "bits": 8
929
- },
930
- "language_model.model.layers.15.mlp.gate_proj": {
931
- "group_size": 64,
932
- "bits": 8
933
- },
934
- "language_model.model.layers.15.mlp.down_proj": {
935
- "group_size": 64,
936
- "bits": 8
937
- },
938
- "language_model.model.layers.15.mlp.up_proj": {
939
- "group_size": 64,
940
- "bits": 8
941
- },
942
- "language_model.model.layers.16.mlp.gate_proj": {
943
- "group_size": 64,
944
- "bits": 8
945
- },
946
- "language_model.model.layers.16.mlp.down_proj": {
947
- "group_size": 64,
948
- "bits": 8
949
- },
950
- "language_model.model.layers.16.mlp.up_proj": {
951
- "group_size": 64,
952
- "bits": 8
953
- },
954
- "language_model.model.layers.17.mlp.gate_proj": {
955
- "group_size": 64,
956
- "bits": 8
957
- },
958
- "language_model.model.layers.17.mlp.down_proj": {
959
- "group_size": 64,
960
- "bits": 8
961
- },
962
- "language_model.model.layers.17.mlp.up_proj": {
963
- "group_size": 64,
964
- "bits": 8
965
- },
966
- "language_model.model.layers.18.mlp.gate_proj": {
967
- "group_size": 64,
968
- "bits": 8
969
- },
970
- "language_model.model.layers.18.mlp.down_proj": {
971
- "group_size": 64,
972
- "bits": 8
973
- },
974
- "language_model.model.layers.18.mlp.up_proj": {
975
- "group_size": 64,
976
- "bits": 8
977
- },
978
- "language_model.model.layers.19.mlp.gate_proj": {
979
- "group_size": 64,
980
- "bits": 8
981
- },
982
- "language_model.model.layers.19.mlp.down_proj": {
983
- "group_size": 64,
984
- "bits": 8
985
- },
986
- "language_model.model.layers.19.mlp.up_proj": {
987
- "group_size": 64,
988
- "bits": 8
989
- },
990
- "language_model.model.layers.20.mlp.gate_proj": {
991
- "group_size": 64,
992
- "bits": 8
993
- },
994
- "language_model.model.layers.20.mlp.down_proj": {
995
- "group_size": 64,
996
- "bits": 8
997
- },
998
- "language_model.model.layers.20.mlp.up_proj": {
999
- "group_size": 64,
1000
- "bits": 8
1001
- },
1002
- "language_model.model.layers.21.mlp.gate_proj": {
1003
- "group_size": 64,
1004
- "bits": 8
1005
- },
1006
- "language_model.model.layers.21.mlp.down_proj": {
1007
- "group_size": 64,
1008
- "bits": 8
1009
- },
1010
- "language_model.model.layers.21.mlp.up_proj": {
1011
- "group_size": 64,
1012
- "bits": 8
1013
- },
1014
- "language_model.model.layers.22.mlp.gate_proj": {
1015
- "group_size": 64,
1016
- "bits": 8
1017
- },
1018
- "language_model.model.layers.22.mlp.down_proj": {
1019
- "group_size": 64,
1020
- "bits": 8
1021
- },
1022
- "language_model.model.layers.22.mlp.up_proj": {
1023
- "group_size": 64,
1024
- "bits": 8
1025
- },
1026
- "language_model.model.layers.23.mlp.gate_proj": {
1027
- "group_size": 64,
1028
- "bits": 8
1029
- },
1030
- "language_model.model.layers.23.mlp.down_proj": {
1031
- "group_size": 64,
1032
- "bits": 8
1033
- },
1034
- "language_model.model.layers.23.mlp.up_proj": {
1035
- "group_size": 64,
1036
- "bits": 8
1037
- },
1038
- "language_model.model.layers.24.mlp.gate_proj": {
1039
- "group_size": 64,
1040
- "bits": 8
1041
- },
1042
- "language_model.model.layers.24.mlp.down_proj": {
1043
- "group_size": 64,
1044
- "bits": 8
1045
- },
1046
- "language_model.model.layers.24.mlp.up_proj": {
1047
- "group_size": 64,
1048
- "bits": 8
1049
- },
1050
- "language_model.model.layers.25.mlp.gate_proj": {
1051
- "group_size": 64,
1052
- "bits": 8
1053
- },
1054
- "language_model.model.layers.25.mlp.down_proj": {
1055
- "group_size": 64,
1056
- "bits": 8
1057
- },
1058
- "language_model.model.layers.25.mlp.up_proj": {
1059
- "group_size": 64,
1060
- "bits": 8
1061
- },
1062
- "language_model.model.layers.26.mlp.gate_proj": {
1063
- "group_size": 64,
1064
- "bits": 8
1065
- },
1066
- "language_model.model.layers.26.mlp.down_proj": {
1067
- "group_size": 64,
1068
- "bits": 8
1069
- },
1070
- "language_model.model.layers.26.mlp.up_proj": {
1071
- "group_size": 64,
1072
- "bits": 8
1073
- },
1074
- "language_model.model.layers.27.mlp.gate_proj": {
1075
- "group_size": 64,
1076
- "bits": 8
1077
- },
1078
- "language_model.model.layers.27.mlp.down_proj": {
1079
- "group_size": 64,
1080
- "bits": 8
1081
- },
1082
- "language_model.model.layers.27.mlp.up_proj": {
1083
- "group_size": 64,
1084
- "bits": 8
1085
- },
1086
- "language_model.model.layers.28.mlp.gate_proj": {
1087
- "group_size": 64,
1088
- "bits": 8
1089
- },
1090
- "language_model.model.layers.28.mlp.down_proj": {
1091
- "group_size": 64,
1092
- "bits": 8
1093
- },
1094
- "language_model.model.layers.28.mlp.up_proj": {
1095
- "group_size": 64,
1096
- "bits": 8
1097
- },
1098
- "language_model.model.layers.29.mlp.gate_proj": {
1099
- "group_size": 64,
1100
- "bits": 8
1101
- },
1102
- "language_model.model.layers.29.mlp.down_proj": {
1103
- "group_size": 64,
1104
- "bits": 8
1105
- },
1106
- "language_model.model.layers.29.mlp.up_proj": {
1107
- "group_size": 64,
1108
- "bits": 8
1109
- },
1110
- "language_model.model.layers.30.mlp.gate_proj": {
1111
- "group_size": 64,
1112
- "bits": 8
1113
- },
1114
- "language_model.model.layers.30.mlp.down_proj": {
1115
- "group_size": 64,
1116
- "bits": 8
1117
- },
1118
- "language_model.model.layers.30.mlp.up_proj": {
1119
- "group_size": 64,
1120
- "bits": 8
1121
- },
1122
- "language_model.model.layers.31.mlp.gate_proj": {
1123
- "group_size": 64,
1124
- "bits": 8
1125
- },
1126
- "language_model.model.layers.31.mlp.down_proj": {
1127
- "group_size": 64,
1128
- "bits": 8
1129
- },
1130
- "language_model.model.layers.31.mlp.up_proj": {
1131
- "group_size": 64,
1132
- "bits": 8
1133
- },
1134
- "language_model.model.layers.32.mlp.gate_proj": {
1135
- "group_size": 64,
1136
- "bits": 8
1137
- },
1138
- "language_model.model.layers.32.mlp.down_proj": {
1139
- "group_size": 64,
1140
- "bits": 8
1141
- },
1142
- "language_model.model.layers.32.mlp.up_proj": {
1143
- "group_size": 64,
1144
- "bits": 8
1145
- },
1146
- "language_model.model.layers.33.mlp.gate_proj": {
1147
- "group_size": 64,
1148
- "bits": 8
1149
- },
1150
- "language_model.model.layers.33.mlp.down_proj": {
1151
- "group_size": 64,
1152
- "bits": 8
1153
- },
1154
- "language_model.model.layers.33.mlp.up_proj": {
1155
- "group_size": 64,
1156
- "bits": 8
1157
- },
1158
- "language_model.model.layers.34.mlp.gate_proj": {
1159
- "group_size": 64,
1160
- "bits": 8
1161
- },
1162
- "language_model.model.layers.34.mlp.down_proj": {
1163
- "group_size": 64,
1164
- "bits": 8
1165
- },
1166
- "language_model.model.layers.34.mlp.up_proj": {
1167
- "group_size": 64,
1168
- "bits": 8
1169
- },
1170
- "language_model.model.layers.35.mlp.gate_proj": {
1171
- "group_size": 64,
1172
- "bits": 8
1173
- },
1174
- "language_model.model.layers.35.mlp.down_proj": {
1175
- "group_size": 64,
1176
- "bits": 8
1177
- },
1178
- "language_model.model.layers.35.mlp.up_proj": {
1179
- "group_size": 64,
1180
- "bits": 8
1181
- },
1182
- "language_model.model.layers.36.mlp.gate_proj": {
1183
- "group_size": 64,
1184
- "bits": 8
1185
- },
1186
- "language_model.model.layers.36.mlp.down_proj": {
1187
- "group_size": 64,
1188
- "bits": 8
1189
- },
1190
- "language_model.model.layers.36.mlp.up_proj": {
1191
- "group_size": 64,
1192
- "bits": 8
1193
- },
1194
- "language_model.model.layers.37.mlp.gate_proj": {
1195
- "group_size": 64,
1196
- "bits": 8
1197
- },
1198
- "language_model.model.layers.37.mlp.down_proj": {
1199
- "group_size": 64,
1200
- "bits": 8
1201
- },
1202
- "language_model.model.layers.37.mlp.up_proj": {
1203
- "group_size": 64,
1204
- "bits": 8
1205
- },
1206
- "language_model.model.layers.38.mlp.gate_proj": {
1207
- "group_size": 64,
1208
- "bits": 8
1209
- },
1210
- "language_model.model.layers.38.mlp.down_proj": {
1211
- "group_size": 64,
1212
- "bits": 8
1213
- },
1214
- "language_model.model.layers.38.mlp.up_proj": {
1215
- "group_size": 64,
1216
- "bits": 8
1217
- },
1218
- "language_model.model.layers.39.mlp.gate_proj": {
1219
- "group_size": 64,
1220
- "bits": 8
1221
- },
1222
- "language_model.model.layers.39.mlp.down_proj": {
1223
- "group_size": 64,
1224
- "bits": 8
1225
- },
1226
- "language_model.model.layers.39.mlp.up_proj": {
1227
- "group_size": 64,
1228
- "bits": 8
1229
- },
1230
- "language_model.model.layers.40.mlp.gate_proj": {
1231
- "group_size": 64,
1232
- "bits": 8
1233
- },
1234
- "language_model.model.layers.40.mlp.down_proj": {
1235
- "group_size": 64,
1236
- "bits": 8
1237
- },
1238
- "language_model.model.layers.40.mlp.up_proj": {
1239
- "group_size": 64,
1240
- "bits": 8
1241
- },
1242
- "language_model.model.layers.41.mlp.gate_proj": {
1243
- "group_size": 64,
1244
- "bits": 8
1245
- },
1246
- "language_model.model.layers.41.mlp.down_proj": {
1247
- "group_size": 64,
1248
- "bits": 8
1249
- },
1250
- "language_model.model.layers.41.mlp.up_proj": {
1251
- "group_size": 64,
1252
- "bits": 8
1253
- },
1254
- "language_model.model.layers.42.mlp.gate_proj": {
1255
- "group_size": 64,
1256
- "bits": 8
1257
- },
1258
- "language_model.model.layers.42.mlp.down_proj": {
1259
- "group_size": 64,
1260
- "bits": 8
1261
- },
1262
- "language_model.model.layers.42.mlp.up_proj": {
1263
- "group_size": 64,
1264
- "bits": 8
1265
- },
1266
- "language_model.model.layers.43.mlp.gate_proj": {
1267
- "group_size": 64,
1268
- "bits": 8
1269
- },
1270
- "language_model.model.layers.43.mlp.down_proj": {
1271
- "group_size": 64,
1272
- "bits": 8
1273
- },
1274
- "language_model.model.layers.43.mlp.up_proj": {
1275
- "group_size": 64,
1276
- "bits": 8
1277
- },
1278
- "language_model.model.layers.44.mlp.gate_proj": {
1279
- "group_size": 64,
1280
- "bits": 8
1281
- },
1282
- "language_model.model.layers.44.mlp.down_proj": {
1283
- "group_size": 64,
1284
- "bits": 8
1285
- },
1286
- "language_model.model.layers.44.mlp.up_proj": {
1287
- "group_size": 64,
1288
- "bits": 8
1289
- },
1290
- "language_model.model.layers.45.mlp.gate_proj": {
1291
- "group_size": 64,
1292
- "bits": 8
1293
- },
1294
- "language_model.model.layers.45.mlp.down_proj": {
1295
- "group_size": 64,
1296
- "bits": 8
1297
- },
1298
- "language_model.model.layers.45.mlp.up_proj": {
1299
- "group_size": 64,
1300
- "bits": 8
1301
- },
1302
- "language_model.model.layers.46.mlp.gate_proj": {
1303
- "group_size": 64,
1304
- "bits": 8
1305
- },
1306
- "language_model.model.layers.46.mlp.down_proj": {
1307
- "group_size": 64,
1308
- "bits": 8
1309
- },
1310
- "language_model.model.layers.46.mlp.up_proj": {
1311
- "group_size": 64,
1312
- "bits": 8
1313
- },
1314
- "language_model.model.layers.47.mlp.gate_proj": {
1315
- "group_size": 64,
1316
- "bits": 8
1317
- },
1318
- "language_model.model.layers.47.mlp.down_proj": {
1319
- "group_size": 64,
1320
- "bits": 8
1321
- },
1322
- "language_model.model.layers.47.mlp.up_proj": {
1323
- "group_size": 64,
1324
- "bits": 8
1325
- },
1326
- "language_model.model.layers.48.mlp.gate_proj": {
1327
- "group_size": 64,
1328
- "bits": 8
1329
- },
1330
- "language_model.model.layers.48.mlp.down_proj": {
1331
- "group_size": 64,
1332
- "bits": 8
1333
- },
1334
- "language_model.model.layers.48.mlp.up_proj": {
1335
- "group_size": 64,
1336
- "bits": 8
1337
- },
1338
- "language_model.model.layers.49.mlp.gate_proj": {
1339
- "group_size": 64,
1340
- "bits": 8
1341
- },
1342
- "language_model.model.layers.49.mlp.down_proj": {
1343
- "group_size": 64,
1344
- "bits": 8
1345
- },
1346
- "language_model.model.layers.49.mlp.up_proj": {
1347
- "group_size": 64,
1348
- "bits": 8
1349
- },
1350
- "language_model.model.layers.50.mlp.gate_proj": {
1351
- "group_size": 64,
1352
- "bits": 8
1353
- },
1354
- "language_model.model.layers.50.mlp.down_proj": {
1355
- "group_size": 64,
1356
- "bits": 8
1357
- },
1358
- "language_model.model.layers.50.mlp.up_proj": {
1359
- "group_size": 64,
1360
- "bits": 8
1361
- },
1362
- "language_model.model.layers.51.mlp.gate_proj": {
1363
- "group_size": 64,
1364
- "bits": 8
1365
- },
1366
- "language_model.model.layers.51.mlp.down_proj": {
1367
- "group_size": 64,
1368
- "bits": 8
1369
- },
1370
- "language_model.model.layers.51.mlp.up_proj": {
1371
- "group_size": 64,
1372
- "bits": 8
1373
- },
1374
- "language_model.model.layers.52.mlp.gate_proj": {
1375
- "group_size": 64,
1376
- "bits": 8
1377
- },
1378
- "language_model.model.layers.52.mlp.down_proj": {
1379
- "group_size": 64,
1380
- "bits": 8
1381
- },
1382
- "language_model.model.layers.52.mlp.up_proj": {
1383
- "group_size": 64,
1384
- "bits": 8
1385
- },
1386
- "language_model.model.layers.53.mlp.gate_proj": {
1387
- "group_size": 64,
1388
- "bits": 8
1389
- },
1390
- "language_model.model.layers.53.mlp.down_proj": {
1391
- "group_size": 64,
1392
- "bits": 8
1393
- },
1394
- "language_model.model.layers.53.mlp.up_proj": {
1395
- "group_size": 64,
1396
- "bits": 8
1397
- },
1398
- "language_model.model.layers.54.mlp.gate_proj": {
1399
- "group_size": 64,
1400
- "bits": 8
1401
- },
1402
- "language_model.model.layers.54.mlp.down_proj": {
1403
- "group_size": 64,
1404
- "bits": 8
1405
- },
1406
- "language_model.model.layers.54.mlp.up_proj": {
1407
- "group_size": 64,
1408
- "bits": 8
1409
- },
1410
- "language_model.model.layers.55.mlp.gate_proj": {
1411
- "group_size": 64,
1412
- "bits": 8
1413
- },
1414
- "language_model.model.layers.55.mlp.down_proj": {
1415
- "group_size": 64,
1416
- "bits": 8
1417
- },
1418
- "language_model.model.layers.55.mlp.up_proj": {
1419
- "group_size": 64,
1420
- "bits": 8
1421
- },
1422
- "language_model.model.layers.56.mlp.gate_proj": {
1423
- "group_size": 64,
1424
- "bits": 8
1425
- },
1426
- "language_model.model.layers.56.mlp.down_proj": {
1427
- "group_size": 64,
1428
- "bits": 8
1429
- },
1430
- "language_model.model.layers.56.mlp.up_proj": {
1431
- "group_size": 64,
1432
- "bits": 8
1433
- },
1434
- "language_model.model.layers.57.mlp.gate_proj": {
1435
- "group_size": 64,
1436
- "bits": 8
1437
- },
1438
- "language_model.model.layers.57.mlp.down_proj": {
1439
- "group_size": 64,
1440
- "bits": 8
1441
- },
1442
- "language_model.model.layers.57.mlp.up_proj": {
1443
- "group_size": 64,
1444
- "bits": 8
1445
- },
1446
- "language_model.model.layers.58.mlp.gate_proj": {
1447
- "group_size": 64,
1448
- "bits": 8
1449
- },
1450
- "language_model.model.layers.58.mlp.down_proj": {
1451
- "group_size": 64,
1452
- "bits": 8
1453
- },
1454
- "language_model.model.layers.58.mlp.up_proj": {
1455
- "group_size": 64,
1456
- "bits": 8
1457
- },
1458
- "language_model.model.layers.59.mlp.gate_proj": {
1459
- "group_size": 64,
1460
- "bits": 8
1461
- },
1462
- "language_model.model.layers.59.mlp.down_proj": {
1463
- "group_size": 64,
1464
- "bits": 8
1465
- },
1466
- "language_model.model.layers.59.mlp.up_proj": {
1467
- "group_size": 64,
1468
- "bits": 8
1469
- }
1470
  },
1471
- "text_config": {
1472
- "attention_bias": false,
1473
- "attention_dropout": 0.0,
1474
- "attention_k_eq_v": true,
1475
- "bos_token_id": 2,
1476
- "dtype": "bfloat16",
1477
- "enable_moe_block": false,
1478
- "eos_token_id": 1,
1479
- "expert_intermediate_size": null,
1480
- "final_logit_softcapping": 30.0,
1481
- "global_head_dim": 512,
1482
- "head_dim": 256,
1483
- "hidden_activation": "gelu_pytorch_tanh",
1484
- "hidden_size": 5376,
1485
- "hidden_size_per_layer_input": 0,
1486
- "initializer_range": 0.02,
1487
- "intermediate_size": 21504,
1488
- "layer_types": [
1489
- "sliding_attention",
1490
- "sliding_attention",
1491
- "sliding_attention",
1492
- "sliding_attention",
1493
- "sliding_attention",
1494
- "full_attention",
1495
- "sliding_attention",
1496
- "sliding_attention",
1497
- "sliding_attention",
1498
- "sliding_attention",
1499
- "sliding_attention",
1500
- "full_attention",
1501
- "sliding_attention",
1502
- "sliding_attention",
1503
- "sliding_attention",
1504
- "sliding_attention",
1505
- "sliding_attention",
1506
- "full_attention",
1507
- "sliding_attention",
1508
- "sliding_attention",
1509
- "sliding_attention",
1510
- "sliding_attention",
1511
- "sliding_attention",
1512
- "full_attention",
1513
- "sliding_attention",
1514
- "sliding_attention",
1515
- "sliding_attention",
1516
- "sliding_attention",
1517
- "sliding_attention",
1518
- "full_attention",
1519
- "sliding_attention",
1520
- "sliding_attention",
1521
- "sliding_attention",
1522
- "sliding_attention",
1523
- "sliding_attention",
1524
- "full_attention",
1525
- "sliding_attention",
1526
- "sliding_attention",
1527
- "sliding_attention",
1528
- "sliding_attention",
1529
- "sliding_attention",
1530
- "full_attention",
1531
- "sliding_attention",
1532
- "sliding_attention",
1533
- "sliding_attention",
1534
- "sliding_attention",
1535
- "sliding_attention",
1536
- "full_attention",
1537
- "sliding_attention",
1538
- "sliding_attention",
1539
- "sliding_attention",
1540
- "sliding_attention",
1541
- "sliding_attention",
1542
- "full_attention",
1543
- "sliding_attention",
1544
- "sliding_attention",
1545
- "sliding_attention",
1546
- "sliding_attention",
1547
- "sliding_attention",
1548
- "full_attention"
1549
- ],
1550
- "max_position_embeddings": 262144,
1551
- "model_type": "gemma4_text",
1552
- "num_attention_heads": 32,
1553
- "num_experts": null,
1554
- "num_global_key_value_heads": 4,
1555
- "num_hidden_layers": 60,
1556
- "num_key_value_heads": 16,
1557
- "num_kv_shared_layers": 0,
1558
- "pad_token_id": 0,
1559
- "rms_norm_eps": 1e-06,
1560
- "rope_parameters": {
1561
- "full_attention": {
1562
- "partial_rotary_factor": 0.25,
1563
- "rope_theta": 1000000.0,
1564
- "rope_type": "proportional"
1565
- },
1566
- "sliding_attention": {
1567
- "rope_theta": 10000.0,
1568
- "rope_type": "default"
1569
- }
1570
- },
1571
- "sliding_window": 1024,
1572
- "tie_word_embeddings": true,
1573
- "top_k_experts": null,
1574
- "use_bidirectional_attention": "vision",
1575
- "use_cache": true,
1576
- "use_double_wide_mlp": false,
1577
- "vocab_size": 262144,
1578
- "vocab_size_per_layer_input": 262144
1579
  },
1580
- "tie_word_embeddings": true,
1581
- "transformers_version": "5.5.0.dev0",
1582
- "video_token_id": 258884,
1583
- "vision_config": {
1584
- "_name_or_path": "",
1585
- "architectures": null,
1586
- "attention_bias": false,
1587
- "attention_dropout": 0.0,
1588
- "chunk_size_feed_forward": 0,
1589
- "default_output_length": 280,
1590
- "dtype": "bfloat16",
1591
- "global_head_dim": 72,
1592
- "head_dim": 72,
1593
- "hidden_activation": "gelu_pytorch_tanh",
1594
- "hidden_size": 1152,
1595
- "id2label": {
1596
- "0": "LABEL_0",
1597
- "1": "LABEL_1"
1598
- },
1599
- "initializer_range": 0.02,
1600
- "intermediate_size": 4304,
1601
- "is_encoder_decoder": false,
1602
- "label2id": {
1603
- "LABEL_0": 0,
1604
- "LABEL_1": 1
1605
- },
1606
- "max_position_embeddings": 131072,
1607
- "model_type": "gemma4_vision",
1608
- "num_attention_heads": 16,
1609
- "num_hidden_layers": 27,
1610
- "num_key_value_heads": 16,
1611
- "output_attentions": false,
1612
- "output_hidden_states": false,
1613
- "patch_size": 16,
1614
- "pooling_kernel_size": 3,
1615
- "position_embedding_size": 10240,
1616
- "problem_type": null,
1617
- "return_dict": true,
1618
- "rms_norm_eps": 1e-06,
1619
- "rope_parameters": {
1620
- "rope_theta": 100.0,
1621
- "rope_type": "default"
1622
- },
1623
- "standardize": true,
1624
- "use_clipped_linears": false
1625
  },
1626
- "vision_soft_tokens_per_image": 280
1627
- }
 
 
 
 
1
  {
2
+ "architectures": [
3
+ "Gemma4ForConditionalGeneration"
4
+ ],
5
+ "audio_config": null,
6
+ "audio_token_id": 258881,
7
+ "boa_token_id": 256000,
8
+ "boi_token_id": 255999,
9
+ "dtype": "bfloat16",
10
+ "eoa_token_id": 258883,
11
+ "eoa_token_index": 258883,
12
+ "eoi_token_id": 258882,
13
+ "eos_token_id": [
14
+ 1,
15
+ 106
16
+ ],
17
+ "image_token_id": 258880,
18
+ "initializer_range": 0.02,
19
+ "model_type": "gemma4",
20
+ "text_config": {
21
+ "attention_bias": false,
22
+ "attention_dropout": 0.0,
23
+ "attention_k_eq_v": true,
24
+ "bos_token_id": 2,
25
  "dtype": "bfloat16",
26
+ "enable_moe_block": false,
27
+ "eos_token_id": 1,
28
+ "expert_intermediate_size": null,
29
+ "final_logit_softcapping": 30.0,
30
+ "global_head_dim": 512,
31
+ "head_dim": 256,
32
+ "hidden_activation": "gelu_pytorch_tanh",
33
+ "hidden_size": 5376,
34
+ "hidden_size_per_layer_input": 0,
35
  "initializer_range": 0.02,
36
+ "intermediate_size": 21504,
37
+ "layer_types": [
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "full_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "sliding_attention",
61
+ "full_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "sliding_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "full_attention",
68
+ "sliding_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention",
72
+ "sliding_attention",
73
+ "full_attention",
74
+ "sliding_attention",
75
+ "sliding_attention",
76
+ "sliding_attention",
77
+ "sliding_attention",
78
+ "sliding_attention",
79
+ "full_attention",
80
+ "sliding_attention",
81
+ "sliding_attention",
82
+ "sliding_attention",
83
+ "sliding_attention",
84
+ "sliding_attention",
85
+ "full_attention",
86
+ "sliding_attention",
87
+ "sliding_attention",
88
+ "sliding_attention",
89
+ "sliding_attention",
90
+ "sliding_attention",
91
+ "full_attention",
92
+ "sliding_attention",
93
+ "sliding_attention",
94
+ "sliding_attention",
95
+ "sliding_attention",
96
+ "sliding_attention",
97
+ "full_attention"
98
+ ],
99
+ "max_position_embeddings": 262144,
100
+ "model_type": "gemma4_text",
101
+ "num_attention_heads": 32,
102
+ "num_experts": null,
103
+ "num_global_key_value_heads": 4,
104
+ "num_hidden_layers": 60,
105
+ "num_key_value_heads": 16,
106
+ "num_kv_shared_layers": 0,
107
+ "pad_token_id": 0,
108
+ "rms_norm_eps": 1e-06,
109
+ "rope_parameters": {
110
+ "full_attention": {
111
+ "partial_rotary_factor": 0.25,
112
+ "rope_theta": 1000000.0,
113
+ "rope_type": "proportional"
114
+ },
115
+ "sliding_attention": {
116
+ "rope_theta": 10000.0,
117
+ "rope_type": "default"
118
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  },
120
+ "sliding_window": 1024,
121
+ "tie_word_embeddings": true,
122
+ "top_k_experts": null,
123
+ "use_bidirectional_attention": "vision",
124
+ "use_cache": true,
125
+ "use_double_wide_mlp": false,
126
+ "vocab_size": 262144,
127
+ "vocab_size_per_layer_input": 262144
128
+ },
129
+ "tie_word_embeddings": true,
130
+ "transformers_version": "5.5.0.dev0",
131
+ "video_token_id": 258884,
132
+ "vision_config": {
133
+ "_name_or_path": "",
134
+ "architectures": null,
135
+ "attention_bias": false,
136
+ "attention_dropout": 0.0,
137
+ "chunk_size_feed_forward": 0,
138
+ "default_output_length": 280,
139
+ "dtype": "bfloat16",
140
+ "global_head_dim": 72,
141
+ "head_dim": 72,
142
+ "hidden_activation": "gelu_pytorch_tanh",
143
+ "hidden_size": 1152,
144
+ "id2label": {
145
+ "0": "LABEL_0",
146
+ "1": "LABEL_1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  },
148
+ "initializer_range": 0.02,
149
+ "intermediate_size": 4304,
150
+ "is_encoder_decoder": false,
151
+ "label2id": {
152
+ "LABEL_0": 0,
153
+ "LABEL_1": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  },
155
+ "max_position_embeddings": 131072,
156
+ "model_type": "gemma4_vision",
157
+ "num_attention_heads": 16,
158
+ "num_hidden_layers": 27,
159
+ "num_key_value_heads": 16,
160
+ "output_attentions": false,
161
+ "output_hidden_states": false,
162
+ "patch_size": 16,
163
+ "pooling_kernel_size": 3,
164
+ "position_embedding_size": 10240,
165
+ "problem_type": null,
166
+ "return_dict": true,
167
+ "rms_norm_eps": 1e-06,
168
+ "rope_parameters": {
169
+ "rope_theta": 100.0,
170
+ "rope_type": "default"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  },
172
+ "standardize": true,
173
+ "use_clipped_linears": false
174
+ },
175
+ "vision_soft_tokens_per_image": 280
176
+ }