lthn committed on
Commit
a807aa6
·
verified ·
1 Parent(s): f1428aa

fix(config): sync with upstream google/gemma-4-E2B-it day-1 fixes (b446025, b4a6011)

Browse files

Aligns lemer config files with two upstream Google fixes:
- b446025 (#13): prefix-preserving tool calls + dialog compliance
- b4a6011 (#15): response schema update

Changes:
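- config.json: restore vision_config block; trim eos_token_id from [1, 106, 50] to [1, 106] (drops the extra token id 50)
- config.json: also removes the top-level quantization and quantization_config blocks and text_config.moe_intermediate_size; transformers_version changes from 5.5.3 to 5.5.0.dev0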

Preserves lemer-specific tokenizer_config.json (model_specific_special_tokens were embedded into the model during LEK training and must remain).

Files changed (1) hide show
  1. config.json +182 -1034
config.json CHANGED
@@ -1,1042 +1,190 @@
1
  {
2
- "architectures": [
3
- "Gemma4ForConditionalGeneration"
4
- ],
5
- "audio_config": {
6
- "_name_or_path": "",
7
- "architectures": null,
8
- "attention_chunk_size": 12,
9
- "attention_context_left": 13,
10
- "attention_context_right": 0,
11
- "attention_invalid_logits_value": -1000000000.0,
12
- "attention_logit_cap": 50.0,
13
- "chunk_size_feed_forward": 0,
14
- "conv_kernel_size": 5,
15
- "dtype": "bfloat16",
16
- "gradient_clipping": 10000000000.0,
17
- "hidden_act": "silu",
18
- "hidden_size": 1024,
19
- "id2label": {
20
- "0": "LABEL_0",
21
- "1": "LABEL_1"
22
- },
23
- "initializer_range": 0.02,
24
- "is_encoder_decoder": false,
25
- "label2id": {
26
- "LABEL_0": 0,
27
- "LABEL_1": 1
28
- },
29
- "model_type": "gemma4_audio",
30
- "num_attention_heads": 8,
31
- "num_hidden_layers": 12,
32
- "output_attentions": false,
33
- "output_hidden_states": false,
34
- "output_proj_dims": 1536,
35
- "problem_type": null,
36
- "residual_weight": 0.5,
37
- "return_dict": true,
38
- "rms_norm_eps": 1e-06,
39
- "subsampling_conv_channels": [
40
- 128,
41
- 32
42
- ],
43
- "use_clipped_linears": true
44
- },
45
- "audio_token_id": 258881,
46
- "boa_token_id": 256000,
47
- "boi_token_id": 255999,
48
  "dtype": "bfloat16",
49
- "eoa_token_id": 258883,
50
- "eoa_token_index": 258883,
51
- "eoi_token_id": 258882,
52
- "eos_token_id": [
53
- 1,
54
- 106,
55
- 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  ],
57
- "image_token_id": 258880,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  "initializer_range": 0.02,
59
- "model_type": "gemma4",
60
- "quantization": {
61
- "group_size": 64,
62
- "bits": 4,
63
- "mode": "affine",
64
- "language_model.model.layers.0.mlp.gate_proj": {
65
- "group_size": 64,
66
- "bits": 8
67
- },
68
- "language_model.model.layers.0.mlp.down_proj": {
69
- "group_size": 64,
70
- "bits": 8
71
- },
72
- "language_model.model.layers.0.mlp.up_proj": {
73
- "group_size": 64,
74
- "bits": 8
75
- },
76
- "language_model.model.layers.1.mlp.gate_proj": {
77
- "group_size": 64,
78
- "bits": 8
79
- },
80
- "language_model.model.layers.1.mlp.down_proj": {
81
- "group_size": 64,
82
- "bits": 8
83
- },
84
- "language_model.model.layers.1.mlp.up_proj": {
85
- "group_size": 64,
86
- "bits": 8
87
- },
88
- "language_model.model.layers.2.mlp.gate_proj": {
89
- "group_size": 64,
90
- "bits": 8
91
- },
92
- "language_model.model.layers.2.mlp.down_proj": {
93
- "group_size": 64,
94
- "bits": 8
95
- },
96
- "language_model.model.layers.2.mlp.up_proj": {
97
- "group_size": 64,
98
- "bits": 8
99
- },
100
- "language_model.model.layers.3.mlp.gate_proj": {
101
- "group_size": 64,
102
- "bits": 8
103
- },
104
- "language_model.model.layers.3.mlp.down_proj": {
105
- "group_size": 64,
106
- "bits": 8
107
- },
108
- "language_model.model.layers.3.mlp.up_proj": {
109
- "group_size": 64,
110
- "bits": 8
111
- },
112
- "language_model.model.layers.4.mlp.gate_proj": {
113
- "group_size": 64,
114
- "bits": 8
115
- },
116
- "language_model.model.layers.4.mlp.down_proj": {
117
- "group_size": 64,
118
- "bits": 8
119
- },
120
- "language_model.model.layers.4.mlp.up_proj": {
121
- "group_size": 64,
122
- "bits": 8
123
- },
124
- "language_model.model.layers.5.mlp.gate_proj": {
125
- "group_size": 64,
126
- "bits": 8
127
- },
128
- "language_model.model.layers.5.mlp.down_proj": {
129
- "group_size": 64,
130
- "bits": 8
131
- },
132
- "language_model.model.layers.5.mlp.up_proj": {
133
- "group_size": 64,
134
- "bits": 8
135
- },
136
- "language_model.model.layers.6.mlp.gate_proj": {
137
- "group_size": 64,
138
- "bits": 8
139
- },
140
- "language_model.model.layers.6.mlp.down_proj": {
141
- "group_size": 64,
142
- "bits": 8
143
- },
144
- "language_model.model.layers.6.mlp.up_proj": {
145
- "group_size": 64,
146
- "bits": 8
147
- },
148
- "language_model.model.layers.7.mlp.gate_proj": {
149
- "group_size": 64,
150
- "bits": 8
151
- },
152
- "language_model.model.layers.7.mlp.down_proj": {
153
- "group_size": 64,
154
- "bits": 8
155
- },
156
- "language_model.model.layers.7.mlp.up_proj": {
157
- "group_size": 64,
158
- "bits": 8
159
- },
160
- "language_model.model.layers.8.mlp.gate_proj": {
161
- "group_size": 64,
162
- "bits": 8
163
- },
164
- "language_model.model.layers.8.mlp.down_proj": {
165
- "group_size": 64,
166
- "bits": 8
167
- },
168
- "language_model.model.layers.8.mlp.up_proj": {
169
- "group_size": 64,
170
- "bits": 8
171
- },
172
- "language_model.model.layers.9.mlp.gate_proj": {
173
- "group_size": 64,
174
- "bits": 8
175
- },
176
- "language_model.model.layers.9.mlp.down_proj": {
177
- "group_size": 64,
178
- "bits": 8
179
- },
180
- "language_model.model.layers.9.mlp.up_proj": {
181
- "group_size": 64,
182
- "bits": 8
183
- },
184
- "language_model.model.layers.10.mlp.gate_proj": {
185
- "group_size": 64,
186
- "bits": 8
187
- },
188
- "language_model.model.layers.10.mlp.down_proj": {
189
- "group_size": 64,
190
- "bits": 8
191
- },
192
- "language_model.model.layers.10.mlp.up_proj": {
193
- "group_size": 64,
194
- "bits": 8
195
- },
196
- "language_model.model.layers.11.mlp.gate_proj": {
197
- "group_size": 64,
198
- "bits": 8
199
- },
200
- "language_model.model.layers.11.mlp.down_proj": {
201
- "group_size": 64,
202
- "bits": 8
203
- },
204
- "language_model.model.layers.11.mlp.up_proj": {
205
- "group_size": 64,
206
- "bits": 8
207
- },
208
- "language_model.model.layers.12.mlp.gate_proj": {
209
- "group_size": 64,
210
- "bits": 8
211
- },
212
- "language_model.model.layers.12.mlp.down_proj": {
213
- "group_size": 64,
214
- "bits": 8
215
- },
216
- "language_model.model.layers.12.mlp.up_proj": {
217
- "group_size": 64,
218
- "bits": 8
219
- },
220
- "language_model.model.layers.13.mlp.gate_proj": {
221
- "group_size": 64,
222
- "bits": 8
223
- },
224
- "language_model.model.layers.13.mlp.down_proj": {
225
- "group_size": 64,
226
- "bits": 8
227
- },
228
- "language_model.model.layers.13.mlp.up_proj": {
229
- "group_size": 64,
230
- "bits": 8
231
- },
232
- "language_model.model.layers.14.mlp.gate_proj": {
233
- "group_size": 64,
234
- "bits": 8
235
- },
236
- "language_model.model.layers.14.mlp.down_proj": {
237
- "group_size": 64,
238
- "bits": 8
239
- },
240
- "language_model.model.layers.14.mlp.up_proj": {
241
- "group_size": 64,
242
- "bits": 8
243
- },
244
- "language_model.model.layers.15.mlp.gate_proj": {
245
- "group_size": 64,
246
- "bits": 8
247
- },
248
- "language_model.model.layers.15.mlp.down_proj": {
249
- "group_size": 64,
250
- "bits": 8
251
- },
252
- "language_model.model.layers.15.mlp.up_proj": {
253
- "group_size": 64,
254
- "bits": 8
255
- },
256
- "language_model.model.layers.16.mlp.gate_proj": {
257
- "group_size": 64,
258
- "bits": 8
259
- },
260
- "language_model.model.layers.16.mlp.down_proj": {
261
- "group_size": 64,
262
- "bits": 8
263
- },
264
- "language_model.model.layers.16.mlp.up_proj": {
265
- "group_size": 64,
266
- "bits": 8
267
- },
268
- "language_model.model.layers.17.mlp.gate_proj": {
269
- "group_size": 64,
270
- "bits": 8
271
- },
272
- "language_model.model.layers.17.mlp.down_proj": {
273
- "group_size": 64,
274
- "bits": 8
275
- },
276
- "language_model.model.layers.17.mlp.up_proj": {
277
- "group_size": 64,
278
- "bits": 8
279
- },
280
- "language_model.model.layers.18.mlp.gate_proj": {
281
- "group_size": 64,
282
- "bits": 8
283
- },
284
- "language_model.model.layers.18.mlp.down_proj": {
285
- "group_size": 64,
286
- "bits": 8
287
- },
288
- "language_model.model.layers.18.mlp.up_proj": {
289
- "group_size": 64,
290
- "bits": 8
291
- },
292
- "language_model.model.layers.19.mlp.gate_proj": {
293
- "group_size": 64,
294
- "bits": 8
295
- },
296
- "language_model.model.layers.19.mlp.down_proj": {
297
- "group_size": 64,
298
- "bits": 8
299
- },
300
- "language_model.model.layers.19.mlp.up_proj": {
301
- "group_size": 64,
302
- "bits": 8
303
- },
304
- "language_model.model.layers.20.mlp.gate_proj": {
305
- "group_size": 64,
306
- "bits": 8
307
- },
308
- "language_model.model.layers.20.mlp.down_proj": {
309
- "group_size": 64,
310
- "bits": 8
311
- },
312
- "language_model.model.layers.20.mlp.up_proj": {
313
- "group_size": 64,
314
- "bits": 8
315
- },
316
- "language_model.model.layers.21.mlp.gate_proj": {
317
- "group_size": 64,
318
- "bits": 8
319
- },
320
- "language_model.model.layers.21.mlp.down_proj": {
321
- "group_size": 64,
322
- "bits": 8
323
- },
324
- "language_model.model.layers.21.mlp.up_proj": {
325
- "group_size": 64,
326
- "bits": 8
327
- },
328
- "language_model.model.layers.22.mlp.gate_proj": {
329
- "group_size": 64,
330
- "bits": 8
331
- },
332
- "language_model.model.layers.22.mlp.down_proj": {
333
- "group_size": 64,
334
- "bits": 8
335
- },
336
- "language_model.model.layers.22.mlp.up_proj": {
337
- "group_size": 64,
338
- "bits": 8
339
- },
340
- "language_model.model.layers.23.mlp.gate_proj": {
341
- "group_size": 64,
342
- "bits": 8
343
- },
344
- "language_model.model.layers.23.mlp.down_proj": {
345
- "group_size": 64,
346
- "bits": 8
347
- },
348
- "language_model.model.layers.23.mlp.up_proj": {
349
- "group_size": 64,
350
- "bits": 8
351
- },
352
- "language_model.model.layers.24.mlp.gate_proj": {
353
- "group_size": 64,
354
- "bits": 8
355
- },
356
- "language_model.model.layers.24.mlp.down_proj": {
357
- "group_size": 64,
358
- "bits": 8
359
- },
360
- "language_model.model.layers.24.mlp.up_proj": {
361
- "group_size": 64,
362
- "bits": 8
363
- },
364
- "language_model.model.layers.25.mlp.gate_proj": {
365
- "group_size": 64,
366
- "bits": 8
367
- },
368
- "language_model.model.layers.25.mlp.down_proj": {
369
- "group_size": 64,
370
- "bits": 8
371
- },
372
- "language_model.model.layers.25.mlp.up_proj": {
373
- "group_size": 64,
374
- "bits": 8
375
- },
376
- "language_model.model.layers.26.mlp.gate_proj": {
377
- "group_size": 64,
378
- "bits": 8
379
- },
380
- "language_model.model.layers.26.mlp.down_proj": {
381
- "group_size": 64,
382
- "bits": 8
383
- },
384
- "language_model.model.layers.26.mlp.up_proj": {
385
- "group_size": 64,
386
- "bits": 8
387
- },
388
- "language_model.model.layers.27.mlp.gate_proj": {
389
- "group_size": 64,
390
- "bits": 8
391
- },
392
- "language_model.model.layers.27.mlp.down_proj": {
393
- "group_size": 64,
394
- "bits": 8
395
- },
396
- "language_model.model.layers.27.mlp.up_proj": {
397
- "group_size": 64,
398
- "bits": 8
399
- },
400
- "language_model.model.layers.28.mlp.gate_proj": {
401
- "group_size": 64,
402
- "bits": 8
403
- },
404
- "language_model.model.layers.28.mlp.down_proj": {
405
- "group_size": 64,
406
- "bits": 8
407
- },
408
- "language_model.model.layers.28.mlp.up_proj": {
409
- "group_size": 64,
410
- "bits": 8
411
- },
412
- "language_model.model.layers.29.mlp.gate_proj": {
413
- "group_size": 64,
414
- "bits": 8
415
- },
416
- "language_model.model.layers.29.mlp.down_proj": {
417
- "group_size": 64,
418
- "bits": 8
419
- },
420
- "language_model.model.layers.29.mlp.up_proj": {
421
- "group_size": 64,
422
- "bits": 8
423
- },
424
- "language_model.model.layers.30.mlp.gate_proj": {
425
- "group_size": 64,
426
- "bits": 8
427
- },
428
- "language_model.model.layers.30.mlp.down_proj": {
429
- "group_size": 64,
430
- "bits": 8
431
- },
432
- "language_model.model.layers.30.mlp.up_proj": {
433
- "group_size": 64,
434
- "bits": 8
435
- },
436
- "language_model.model.layers.31.mlp.gate_proj": {
437
- "group_size": 64,
438
- "bits": 8
439
- },
440
- "language_model.model.layers.31.mlp.down_proj": {
441
- "group_size": 64,
442
- "bits": 8
443
- },
444
- "language_model.model.layers.31.mlp.up_proj": {
445
- "group_size": 64,
446
- "bits": 8
447
- },
448
- "language_model.model.layers.32.mlp.gate_proj": {
449
- "group_size": 64,
450
- "bits": 8
451
- },
452
- "language_model.model.layers.32.mlp.down_proj": {
453
- "group_size": 64,
454
- "bits": 8
455
- },
456
- "language_model.model.layers.32.mlp.up_proj": {
457
- "group_size": 64,
458
- "bits": 8
459
- },
460
- "language_model.model.layers.33.mlp.gate_proj": {
461
- "group_size": 64,
462
- "bits": 8
463
- },
464
- "language_model.model.layers.33.mlp.down_proj": {
465
- "group_size": 64,
466
- "bits": 8
467
- },
468
- "language_model.model.layers.33.mlp.up_proj": {
469
- "group_size": 64,
470
- "bits": 8
471
- },
472
- "language_model.model.layers.34.mlp.gate_proj": {
473
- "group_size": 64,
474
- "bits": 8
475
- },
476
- "language_model.model.layers.34.mlp.down_proj": {
477
- "group_size": 64,
478
- "bits": 8
479
- },
480
- "language_model.model.layers.34.mlp.up_proj": {
481
- "group_size": 64,
482
- "bits": 8
483
- }
484
  },
485
- "quantization_config": {
486
- "group_size": 64,
487
- "bits": 4,
488
- "mode": "affine",
489
- "language_model.model.layers.0.mlp.gate_proj": {
490
- "group_size": 64,
491
- "bits": 8
492
- },
493
- "language_model.model.layers.0.mlp.down_proj": {
494
- "group_size": 64,
495
- "bits": 8
496
- },
497
- "language_model.model.layers.0.mlp.up_proj": {
498
- "group_size": 64,
499
- "bits": 8
500
- },
501
- "language_model.model.layers.1.mlp.gate_proj": {
502
- "group_size": 64,
503
- "bits": 8
504
- },
505
- "language_model.model.layers.1.mlp.down_proj": {
506
- "group_size": 64,
507
- "bits": 8
508
- },
509
- "language_model.model.layers.1.mlp.up_proj": {
510
- "group_size": 64,
511
- "bits": 8
512
- },
513
- "language_model.model.layers.2.mlp.gate_proj": {
514
- "group_size": 64,
515
- "bits": 8
516
- },
517
- "language_model.model.layers.2.mlp.down_proj": {
518
- "group_size": 64,
519
- "bits": 8
520
- },
521
- "language_model.model.layers.2.mlp.up_proj": {
522
- "group_size": 64,
523
- "bits": 8
524
- },
525
- "language_model.model.layers.3.mlp.gate_proj": {
526
- "group_size": 64,
527
- "bits": 8
528
- },
529
- "language_model.model.layers.3.mlp.down_proj": {
530
- "group_size": 64,
531
- "bits": 8
532
- },
533
- "language_model.model.layers.3.mlp.up_proj": {
534
- "group_size": 64,
535
- "bits": 8
536
- },
537
- "language_model.model.layers.4.mlp.gate_proj": {
538
- "group_size": 64,
539
- "bits": 8
540
- },
541
- "language_model.model.layers.4.mlp.down_proj": {
542
- "group_size": 64,
543
- "bits": 8
544
- },
545
- "language_model.model.layers.4.mlp.up_proj": {
546
- "group_size": 64,
547
- "bits": 8
548
- },
549
- "language_model.model.layers.5.mlp.gate_proj": {
550
- "group_size": 64,
551
- "bits": 8
552
- },
553
- "language_model.model.layers.5.mlp.down_proj": {
554
- "group_size": 64,
555
- "bits": 8
556
- },
557
- "language_model.model.layers.5.mlp.up_proj": {
558
- "group_size": 64,
559
- "bits": 8
560
- },
561
- "language_model.model.layers.6.mlp.gate_proj": {
562
- "group_size": 64,
563
- "bits": 8
564
- },
565
- "language_model.model.layers.6.mlp.down_proj": {
566
- "group_size": 64,
567
- "bits": 8
568
- },
569
- "language_model.model.layers.6.mlp.up_proj": {
570
- "group_size": 64,
571
- "bits": 8
572
- },
573
- "language_model.model.layers.7.mlp.gate_proj": {
574
- "group_size": 64,
575
- "bits": 8
576
- },
577
- "language_model.model.layers.7.mlp.down_proj": {
578
- "group_size": 64,
579
- "bits": 8
580
- },
581
- "language_model.model.layers.7.mlp.up_proj": {
582
- "group_size": 64,
583
- "bits": 8
584
- },
585
- "language_model.model.layers.8.mlp.gate_proj": {
586
- "group_size": 64,
587
- "bits": 8
588
- },
589
- "language_model.model.layers.8.mlp.down_proj": {
590
- "group_size": 64,
591
- "bits": 8
592
- },
593
- "language_model.model.layers.8.mlp.up_proj": {
594
- "group_size": 64,
595
- "bits": 8
596
- },
597
- "language_model.model.layers.9.mlp.gate_proj": {
598
- "group_size": 64,
599
- "bits": 8
600
- },
601
- "language_model.model.layers.9.mlp.down_proj": {
602
- "group_size": 64,
603
- "bits": 8
604
- },
605
- "language_model.model.layers.9.mlp.up_proj": {
606
- "group_size": 64,
607
- "bits": 8
608
- },
609
- "language_model.model.layers.10.mlp.gate_proj": {
610
- "group_size": 64,
611
- "bits": 8
612
- },
613
- "language_model.model.layers.10.mlp.down_proj": {
614
- "group_size": 64,
615
- "bits": 8
616
- },
617
- "language_model.model.layers.10.mlp.up_proj": {
618
- "group_size": 64,
619
- "bits": 8
620
- },
621
- "language_model.model.layers.11.mlp.gate_proj": {
622
- "group_size": 64,
623
- "bits": 8
624
- },
625
- "language_model.model.layers.11.mlp.down_proj": {
626
- "group_size": 64,
627
- "bits": 8
628
- },
629
- "language_model.model.layers.11.mlp.up_proj": {
630
- "group_size": 64,
631
- "bits": 8
632
- },
633
- "language_model.model.layers.12.mlp.gate_proj": {
634
- "group_size": 64,
635
- "bits": 8
636
- },
637
- "language_model.model.layers.12.mlp.down_proj": {
638
- "group_size": 64,
639
- "bits": 8
640
- },
641
- "language_model.model.layers.12.mlp.up_proj": {
642
- "group_size": 64,
643
- "bits": 8
644
- },
645
- "language_model.model.layers.13.mlp.gate_proj": {
646
- "group_size": 64,
647
- "bits": 8
648
- },
649
- "language_model.model.layers.13.mlp.down_proj": {
650
- "group_size": 64,
651
- "bits": 8
652
- },
653
- "language_model.model.layers.13.mlp.up_proj": {
654
- "group_size": 64,
655
- "bits": 8
656
- },
657
- "language_model.model.layers.14.mlp.gate_proj": {
658
- "group_size": 64,
659
- "bits": 8
660
- },
661
- "language_model.model.layers.14.mlp.down_proj": {
662
- "group_size": 64,
663
- "bits": 8
664
- },
665
- "language_model.model.layers.14.mlp.up_proj": {
666
- "group_size": 64,
667
- "bits": 8
668
- },
669
- "language_model.model.layers.15.mlp.gate_proj": {
670
- "group_size": 64,
671
- "bits": 8
672
- },
673
- "language_model.model.layers.15.mlp.down_proj": {
674
- "group_size": 64,
675
- "bits": 8
676
- },
677
- "language_model.model.layers.15.mlp.up_proj": {
678
- "group_size": 64,
679
- "bits": 8
680
- },
681
- "language_model.model.layers.16.mlp.gate_proj": {
682
- "group_size": 64,
683
- "bits": 8
684
- },
685
- "language_model.model.layers.16.mlp.down_proj": {
686
- "group_size": 64,
687
- "bits": 8
688
- },
689
- "language_model.model.layers.16.mlp.up_proj": {
690
- "group_size": 64,
691
- "bits": 8
692
- },
693
- "language_model.model.layers.17.mlp.gate_proj": {
694
- "group_size": 64,
695
- "bits": 8
696
- },
697
- "language_model.model.layers.17.mlp.down_proj": {
698
- "group_size": 64,
699
- "bits": 8
700
- },
701
- "language_model.model.layers.17.mlp.up_proj": {
702
- "group_size": 64,
703
- "bits": 8
704
- },
705
- "language_model.model.layers.18.mlp.gate_proj": {
706
- "group_size": 64,
707
- "bits": 8
708
- },
709
- "language_model.model.layers.18.mlp.down_proj": {
710
- "group_size": 64,
711
- "bits": 8
712
- },
713
- "language_model.model.layers.18.mlp.up_proj": {
714
- "group_size": 64,
715
- "bits": 8
716
- },
717
- "language_model.model.layers.19.mlp.gate_proj": {
718
- "group_size": 64,
719
- "bits": 8
720
- },
721
- "language_model.model.layers.19.mlp.down_proj": {
722
- "group_size": 64,
723
- "bits": 8
724
- },
725
- "language_model.model.layers.19.mlp.up_proj": {
726
- "group_size": 64,
727
- "bits": 8
728
- },
729
- "language_model.model.layers.20.mlp.gate_proj": {
730
- "group_size": 64,
731
- "bits": 8
732
- },
733
- "language_model.model.layers.20.mlp.down_proj": {
734
- "group_size": 64,
735
- "bits": 8
736
- },
737
- "language_model.model.layers.20.mlp.up_proj": {
738
- "group_size": 64,
739
- "bits": 8
740
- },
741
- "language_model.model.layers.21.mlp.gate_proj": {
742
- "group_size": 64,
743
- "bits": 8
744
- },
745
- "language_model.model.layers.21.mlp.down_proj": {
746
- "group_size": 64,
747
- "bits": 8
748
- },
749
- "language_model.model.layers.21.mlp.up_proj": {
750
- "group_size": 64,
751
- "bits": 8
752
- },
753
- "language_model.model.layers.22.mlp.gate_proj": {
754
- "group_size": 64,
755
- "bits": 8
756
- },
757
- "language_model.model.layers.22.mlp.down_proj": {
758
- "group_size": 64,
759
- "bits": 8
760
- },
761
- "language_model.model.layers.22.mlp.up_proj": {
762
- "group_size": 64,
763
- "bits": 8
764
- },
765
- "language_model.model.layers.23.mlp.gate_proj": {
766
- "group_size": 64,
767
- "bits": 8
768
- },
769
- "language_model.model.layers.23.mlp.down_proj": {
770
- "group_size": 64,
771
- "bits": 8
772
- },
773
- "language_model.model.layers.23.mlp.up_proj": {
774
- "group_size": 64,
775
- "bits": 8
776
- },
777
- "language_model.model.layers.24.mlp.gate_proj": {
778
- "group_size": 64,
779
- "bits": 8
780
- },
781
- "language_model.model.layers.24.mlp.down_proj": {
782
- "group_size": 64,
783
- "bits": 8
784
- },
785
- "language_model.model.layers.24.mlp.up_proj": {
786
- "group_size": 64,
787
- "bits": 8
788
- },
789
- "language_model.model.layers.25.mlp.gate_proj": {
790
- "group_size": 64,
791
- "bits": 8
792
- },
793
- "language_model.model.layers.25.mlp.down_proj": {
794
- "group_size": 64,
795
- "bits": 8
796
- },
797
- "language_model.model.layers.25.mlp.up_proj": {
798
- "group_size": 64,
799
- "bits": 8
800
- },
801
- "language_model.model.layers.26.mlp.gate_proj": {
802
- "group_size": 64,
803
- "bits": 8
804
- },
805
- "language_model.model.layers.26.mlp.down_proj": {
806
- "group_size": 64,
807
- "bits": 8
808
- },
809
- "language_model.model.layers.26.mlp.up_proj": {
810
- "group_size": 64,
811
- "bits": 8
812
- },
813
- "language_model.model.layers.27.mlp.gate_proj": {
814
- "group_size": 64,
815
- "bits": 8
816
- },
817
- "language_model.model.layers.27.mlp.down_proj": {
818
- "group_size": 64,
819
- "bits": 8
820
- },
821
- "language_model.model.layers.27.mlp.up_proj": {
822
- "group_size": 64,
823
- "bits": 8
824
- },
825
- "language_model.model.layers.28.mlp.gate_proj": {
826
- "group_size": 64,
827
- "bits": 8
828
- },
829
- "language_model.model.layers.28.mlp.down_proj": {
830
- "group_size": 64,
831
- "bits": 8
832
- },
833
- "language_model.model.layers.28.mlp.up_proj": {
834
- "group_size": 64,
835
- "bits": 8
836
- },
837
- "language_model.model.layers.29.mlp.gate_proj": {
838
- "group_size": 64,
839
- "bits": 8
840
- },
841
- "language_model.model.layers.29.mlp.down_proj": {
842
- "group_size": 64,
843
- "bits": 8
844
- },
845
- "language_model.model.layers.29.mlp.up_proj": {
846
- "group_size": 64,
847
- "bits": 8
848
- },
849
- "language_model.model.layers.30.mlp.gate_proj": {
850
- "group_size": 64,
851
- "bits": 8
852
- },
853
- "language_model.model.layers.30.mlp.down_proj": {
854
- "group_size": 64,
855
- "bits": 8
856
- },
857
- "language_model.model.layers.30.mlp.up_proj": {
858
- "group_size": 64,
859
- "bits": 8
860
- },
861
- "language_model.model.layers.31.mlp.gate_proj": {
862
- "group_size": 64,
863
- "bits": 8
864
- },
865
- "language_model.model.layers.31.mlp.down_proj": {
866
- "group_size": 64,
867
- "bits": 8
868
- },
869
- "language_model.model.layers.31.mlp.up_proj": {
870
- "group_size": 64,
871
- "bits": 8
872
- },
873
- "language_model.model.layers.32.mlp.gate_proj": {
874
- "group_size": 64,
875
- "bits": 8
876
- },
877
- "language_model.model.layers.32.mlp.down_proj": {
878
- "group_size": 64,
879
- "bits": 8
880
- },
881
- "language_model.model.layers.32.mlp.up_proj": {
882
- "group_size": 64,
883
- "bits": 8
884
- },
885
- "language_model.model.layers.33.mlp.gate_proj": {
886
- "group_size": 64,
887
- "bits": 8
888
- },
889
- "language_model.model.layers.33.mlp.down_proj": {
890
- "group_size": 64,
891
- "bits": 8
892
- },
893
- "language_model.model.layers.33.mlp.up_proj": {
894
- "group_size": 64,
895
- "bits": 8
896
- },
897
- "language_model.model.layers.34.mlp.gate_proj": {
898
- "group_size": 64,
899
- "bits": 8
900
- },
901
- "language_model.model.layers.34.mlp.down_proj": {
902
- "group_size": 64,
903
- "bits": 8
904
- },
905
- "language_model.model.layers.34.mlp.up_proj": {
906
- "group_size": 64,
907
- "bits": 8
908
- }
909
  },
910
- "text_config": {
911
- "attention_bias": false,
912
- "attention_dropout": 0.0,
913
- "attention_k_eq_v": false,
914
- "bos_token_id": 2,
915
- "dtype": "bfloat16",
916
- "enable_moe_block": false,
917
- "eos_token_id": 1,
918
- "expert_intermediate_size": null,
919
- "final_logit_softcapping": 30.0,
920
- "global_head_dim": 512,
921
- "head_dim": 256,
922
- "hidden_activation": "gelu_pytorch_tanh",
923
- "hidden_size": 1536,
924
- "hidden_size_per_layer_input": 256,
925
- "initializer_range": 0.02,
926
- "intermediate_size": 6144,
927
- "layer_types": [
928
- "sliding_attention",
929
- "sliding_attention",
930
- "sliding_attention",
931
- "sliding_attention",
932
- "full_attention",
933
- "sliding_attention",
934
- "sliding_attention",
935
- "sliding_attention",
936
- "sliding_attention",
937
- "full_attention",
938
- "sliding_attention",
939
- "sliding_attention",
940
- "sliding_attention",
941
- "sliding_attention",
942
- "full_attention",
943
- "sliding_attention",
944
- "sliding_attention",
945
- "sliding_attention",
946
- "sliding_attention",
947
- "full_attention",
948
- "sliding_attention",
949
- "sliding_attention",
950
- "sliding_attention",
951
- "sliding_attention",
952
- "full_attention",
953
- "sliding_attention",
954
- "sliding_attention",
955
- "sliding_attention",
956
- "sliding_attention",
957
- "full_attention",
958
- "sliding_attention",
959
- "sliding_attention",
960
- "sliding_attention",
961
- "sliding_attention",
962
- "full_attention"
963
- ],
964
- "max_position_embeddings": 131072,
965
- "model_type": "gemma4_text",
966
- "moe_intermediate_size": null,
967
- "num_attention_heads": 8,
968
- "num_experts": null,
969
- "num_global_key_value_heads": null,
970
- "num_hidden_layers": 35,
971
- "num_key_value_heads": 1,
972
- "num_kv_shared_layers": 20,
973
- "pad_token_id": 0,
974
- "rms_norm_eps": 1e-06,
975
- "rope_parameters": {
976
- "full_attention": {
977
- "partial_rotary_factor": 0.25,
978
- "rope_theta": 1000000.0,
979
- "rope_type": "proportional"
980
- },
981
- "sliding_attention": {
982
- "rope_theta": 10000.0,
983
- "rope_type": "default"
984
- }
985
- },
986
- "sliding_window": 512,
987
- "tie_word_embeddings": true,
988
- "top_k_experts": null,
989
- "use_bidirectional_attention": null,
990
- "use_cache": true,
991
- "use_double_wide_mlp": true,
992
- "vocab_size": 262144,
993
- "vocab_size_per_layer_input": 262144
994
  },
995
- "tie_word_embeddings": true,
996
- "transformers_version": "5.5.3",
997
- "video_token_id": 258884,
998
- "vision_config": {
999
- "_name_or_path": "",
1000
- "architectures": null,
1001
- "attention_bias": false,
1002
- "attention_dropout": 0.0,
1003
- "chunk_size_feed_forward": 0,
1004
- "default_output_length": 280,
1005
- "dtype": "bfloat16",
1006
- "global_head_dim": 64,
1007
- "head_dim": 64,
1008
- "hidden_activation": "gelu_pytorch_tanh",
1009
- "hidden_size": 768,
1010
- "id2label": {
1011
- "0": "LABEL_0",
1012
- "1": "LABEL_1"
1013
- },
1014
- "initializer_range": 0.02,
1015
- "intermediate_size": 3072,
1016
- "is_encoder_decoder": false,
1017
- "label2id": {
1018
- "LABEL_0": 0,
1019
- "LABEL_1": 1
1020
- },
1021
- "max_position_embeddings": 131072,
1022
- "model_type": "gemma4_vision",
1023
- "num_attention_heads": 12,
1024
- "num_hidden_layers": 16,
1025
- "num_key_value_heads": 12,
1026
- "output_attentions": false,
1027
- "output_hidden_states": false,
1028
- "patch_size": 16,
1029
- "pooling_kernel_size": 3,
1030
- "position_embedding_size": 10240,
1031
- "problem_type": null,
1032
- "return_dict": true,
1033
- "rms_norm_eps": 1e-06,
1034
- "rope_parameters": {
1035
- "rope_theta": 100.0,
1036
- "rope_type": "default"
1037
- },
1038
- "standardize": false,
1039
- "use_clipped_linears": true
1040
  },
1041
- "vision_soft_tokens_per_image": 280
1042
- }
 
 
 
 
1
  {
2
+ "architectures": [
3
+ "Gemma4ForConditionalGeneration"
4
+ ],
5
+ "audio_config": {
6
+ "_name_or_path": "",
7
+ "architectures": null,
8
+ "attention_chunk_size": 12,
9
+ "attention_context_left": 13,
10
+ "attention_context_right": 0,
11
+ "attention_invalid_logits_value": -1000000000.0,
12
+ "attention_logit_cap": 50.0,
13
+ "chunk_size_feed_forward": 0,
14
+ "conv_kernel_size": 5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "dtype": "bfloat16",
16
+ "gradient_clipping": 10000000000.0,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 1024,
19
+ "id2label": {
20
+ "0": "LABEL_0",
21
+ "1": "LABEL_1"
22
+ },
23
+ "initializer_range": 0.02,
24
+ "is_encoder_decoder": false,
25
+ "label2id": {
26
+ "LABEL_0": 0,
27
+ "LABEL_1": 1
28
+ },
29
+ "model_type": "gemma4_audio",
30
+ "num_attention_heads": 8,
31
+ "num_hidden_layers": 12,
32
+ "output_attentions": false,
33
+ "output_hidden_states": false,
34
+ "output_proj_dims": 1536,
35
+ "problem_type": null,
36
+ "residual_weight": 0.5,
37
+ "return_dict": true,
38
+ "rms_norm_eps": 1e-06,
39
+ "subsampling_conv_channels": [
40
+ 128,
41
+ 32
42
  ],
43
+ "use_clipped_linears": true
44
+ },
45
+ "audio_token_id": 258881,
46
+ "boa_token_id": 256000,
47
+ "boi_token_id": 255999,
48
+ "dtype": "bfloat16",
49
+ "eoa_token_id": 258883,
50
+ "eoa_token_index": 258883,
51
+ "eoi_token_id": 258882,
52
+ "eos_token_id": [
53
+ 1,
54
+ 106
55
+ ],
56
+ "image_token_id": 258880,
57
+ "initializer_range": 0.02,
58
+ "model_type": "gemma4",
59
+ "text_config": {
60
+ "attention_bias": false,
61
+ "attention_dropout": 0.0,
62
+ "attention_k_eq_v": false,
63
+ "bos_token_id": 2,
64
+ "dtype": "bfloat16",
65
+ "enable_moe_block": false,
66
+ "eos_token_id": 1,
67
+ "expert_intermediate_size": null,
68
+ "final_logit_softcapping": 30.0,
69
+ "global_head_dim": 512,
70
+ "head_dim": 256,
71
+ "hidden_activation": "gelu_pytorch_tanh",
72
+ "hidden_size": 1536,
73
+ "hidden_size_per_layer_input": 256,
74
  "initializer_range": 0.02,
75
+ "intermediate_size": 6144,
76
+ "layer_types": [
77
+ "sliding_attention",
78
+ "sliding_attention",
79
+ "sliding_attention",
80
+ "sliding_attention",
81
+ "full_attention",
82
+ "sliding_attention",
83
+ "sliding_attention",
84
+ "sliding_attention",
85
+ "sliding_attention",
86
+ "full_attention",
87
+ "sliding_attention",
88
+ "sliding_attention",
89
+ "sliding_attention",
90
+ "sliding_attention",
91
+ "full_attention",
92
+ "sliding_attention",
93
+ "sliding_attention",
94
+ "sliding_attention",
95
+ "sliding_attention",
96
+ "full_attention",
97
+ "sliding_attention",
98
+ "sliding_attention",
99
+ "sliding_attention",
100
+ "sliding_attention",
101
+ "full_attention",
102
+ "sliding_attention",
103
+ "sliding_attention",
104
+ "sliding_attention",
105
+ "sliding_attention",
106
+ "full_attention",
107
+ "sliding_attention",
108
+ "sliding_attention",
109
+ "sliding_attention",
110
+ "sliding_attention",
111
+ "full_attention"
112
+ ],
113
+ "max_position_embeddings": 131072,
114
+ "model_type": "gemma4_text",
115
+ "num_attention_heads": 8,
116
+ "num_experts": null,
117
+ "num_global_key_value_heads": null,
118
+ "num_hidden_layers": 35,
119
+ "num_key_value_heads": 1,
120
+ "num_kv_shared_layers": 20,
121
+ "pad_token_id": 0,
122
+ "rms_norm_eps": 1e-06,
123
+ "rope_parameters": {
124
+ "full_attention": {
125
+ "partial_rotary_factor": 0.25,
126
+ "rope_theta": 1000000.0,
127
+ "rope_type": "proportional"
128
+ },
129
+ "sliding_attention": {
130
+ "rope_theta": 10000.0,
131
+ "rope_type": "default"
132
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  },
134
+ "sliding_window": 512,
135
+ "tie_word_embeddings": true,
136
+ "top_k_experts": null,
137
+ "use_bidirectional_attention": null,
138
+ "use_cache": true,
139
+ "use_double_wide_mlp": true,
140
+ "vocab_size": 262144,
141
+ "vocab_size_per_layer_input": 262144
142
+ },
143
+ "tie_word_embeddings": true,
144
+ "transformers_version": "5.5.0.dev0",
145
+ "video_token_id": 258884,
146
+ "vision_config": {
147
+ "_name_or_path": "",
148
+ "architectures": null,
149
+ "attention_bias": false,
150
+ "attention_dropout": 0.0,
151
+ "chunk_size_feed_forward": 0,
152
+ "default_output_length": 280,
153
+ "dtype": "bfloat16",
154
+ "global_head_dim": 64,
155
+ "head_dim": 64,
156
+ "hidden_activation": "gelu_pytorch_tanh",
157
+ "hidden_size": 768,
158
+ "id2label": {
159
+ "0": "LABEL_0",
160
+ "1": "LABEL_1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  },
162
+ "initializer_range": 0.02,
163
+ "intermediate_size": 3072,
164
+ "is_encoder_decoder": false,
165
+ "label2id": {
166
+ "LABEL_0": 0,
167
+ "LABEL_1": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  },
169
+ "max_position_embeddings": 131072,
170
+ "model_type": "gemma4_vision",
171
+ "num_attention_heads": 12,
172
+ "num_hidden_layers": 16,
173
+ "num_key_value_heads": 12,
174
+ "output_attentions": false,
175
+ "output_hidden_states": false,
176
+ "patch_size": 16,
177
+ "pooling_kernel_size": 3,
178
+ "position_embedding_size": 10240,
179
+ "problem_type": null,
180
+ "return_dict": true,
181
+ "rms_norm_eps": 1e-06,
182
+ "rope_parameters": {
183
+ "rope_theta": 100.0,
184
+ "rope_type": "default"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  },
186
+ "standardize": false,
187
+ "use_clipped_linears": true
188
+ },
189
+ "vision_soft_tokens_per_image": 280
190
+ }