spicyneuron commited on
Commit
3c67fe6
·
verified ·
1 Parent(s): b8380e2

Add files using upload-large-folder tool

Browse files
config.json CHANGED
@@ -39,22 +39,22 @@
39
  "mode": "affine",
40
  "model.embed_tokens": {
41
  "group_size": 64,
42
- "bits": 4,
43
  "mode": "affine"
44
  },
45
  "model.layers.0.linear_attn.in_proj_qkvz": {
46
- "group_size": 32,
47
- "bits": 4,
48
- "mode": "mxfp4"
49
  },
50
  "model.layers.0.linear_attn.in_proj_ba": {
51
  "group_size": 64,
52
- "bits": 4,
53
  "mode": "affine"
54
  },
55
  "model.layers.0.linear_attn.out_proj": {
56
  "group_size": 64,
57
- "bits": 5,
58
  "mode": "affine"
59
  },
60
  "model.layers.0.mlp.switch_mlp.gate_proj": {
@@ -73,18 +73,18 @@
73
  "mode": "mxfp4"
74
  },
75
  "model.layers.1.linear_attn.in_proj_qkvz": {
76
- "group_size": 32,
77
- "bits": 4,
78
- "mode": "mxfp4"
79
  },
80
  "model.layers.1.linear_attn.in_proj_ba": {
81
  "group_size": 64,
82
- "bits": 4,
83
  "mode": "affine"
84
  },
85
  "model.layers.1.linear_attn.out_proj": {
86
  "group_size": 64,
87
- "bits": 5,
88
  "mode": "affine"
89
  },
90
  "model.layers.1.mlp.switch_mlp.gate_proj": {
@@ -98,23 +98,23 @@
98
  "mode": "mxfp4"
99
  },
100
  "model.layers.1.mlp.switch_mlp.down_proj": {
101
- "group_size": 64,
102
- "bits": 6,
103
- "mode": "affine"
104
- },
105
- "model.layers.2.linear_attn.in_proj_qkvz": {
106
  "group_size": 32,
107
  "bits": 4,
108
  "mode": "mxfp4"
109
  },
 
 
 
 
 
110
  "model.layers.2.linear_attn.in_proj_ba": {
111
  "group_size": 64,
112
- "bits": 4,
113
  "mode": "affine"
114
  },
115
  "model.layers.2.linear_attn.out_proj": {
116
  "group_size": 64,
117
- "bits": 5,
118
  "mode": "affine"
119
  },
120
  "model.layers.2.mlp.switch_mlp.gate_proj": {
@@ -133,24 +133,24 @@
133
  "mode": "mxfp4"
134
  },
135
  "model.layers.3.self_attn.q_proj": {
136
- "group_size": 32,
137
- "bits": 4,
138
- "mode": "mxfp4"
139
  },
140
  "model.layers.3.self_attn.k_proj": {
141
- "group_size": 32,
142
- "bits": 4,
143
- "mode": "mxfp4"
144
  },
145
  "model.layers.3.self_attn.v_proj": {
146
- "group_size": 32,
147
- "bits": 4,
148
- "mode": "mxfp4"
149
  },
150
  "model.layers.3.self_attn.o_proj": {
151
- "group_size": 32,
152
- "bits": 4,
153
- "mode": "mxfp4"
154
  },
155
  "model.layers.3.mlp.switch_mlp.gate_proj": {
156
  "group_size": 32,
@@ -168,18 +168,18 @@
168
  "mode": "mxfp4"
169
  },
170
  "model.layers.4.linear_attn.in_proj_qkvz": {
171
- "group_size": 32,
172
- "bits": 4,
173
- "mode": "mxfp4"
174
  },
175
  "model.layers.4.linear_attn.in_proj_ba": {
176
  "group_size": 64,
177
- "bits": 4,
178
  "mode": "affine"
179
  },
180
  "model.layers.4.linear_attn.out_proj": {
181
  "group_size": 64,
182
- "bits": 5,
183
  "mode": "affine"
184
  },
185
  "model.layers.4.mlp.switch_mlp.gate_proj": {
@@ -198,18 +198,18 @@
198
  "mode": "mxfp4"
199
  },
200
  "model.layers.5.linear_attn.in_proj_qkvz": {
201
- "group_size": 32,
202
- "bits": 4,
203
- "mode": "mxfp4"
204
  },
205
  "model.layers.5.linear_attn.in_proj_ba": {
206
  "group_size": 64,
207
- "bits": 4,
208
  "mode": "affine"
209
  },
210
  "model.layers.5.linear_attn.out_proj": {
211
  "group_size": 64,
212
- "bits": 5,
213
  "mode": "affine"
214
  },
215
  "model.layers.5.mlp.switch_mlp.gate_proj": {
@@ -228,18 +228,18 @@
228
  "mode": "mxfp4"
229
  },
230
  "model.layers.6.linear_attn.in_proj_qkvz": {
231
- "group_size": 32,
232
- "bits": 4,
233
- "mode": "mxfp4"
234
  },
235
  "model.layers.6.linear_attn.in_proj_ba": {
236
  "group_size": 64,
237
- "bits": 4,
238
  "mode": "affine"
239
  },
240
  "model.layers.6.linear_attn.out_proj": {
241
  "group_size": 64,
242
- "bits": 5,
243
  "mode": "affine"
244
  },
245
  "model.layers.6.mlp.switch_mlp.gate_proj": {
@@ -253,29 +253,29 @@
253
  "mode": "mxfp4"
254
  },
255
  "model.layers.6.mlp.switch_mlp.down_proj": {
256
- "group_size": 64,
257
- "bits": 6,
258
- "mode": "affine"
259
- },
260
- "model.layers.7.self_attn.q_proj": {
261
  "group_size": 32,
262
  "bits": 4,
263
  "mode": "mxfp4"
264
  },
 
 
 
 
 
265
  "model.layers.7.self_attn.k_proj": {
266
- "group_size": 32,
267
- "bits": 4,
268
- "mode": "mxfp4"
269
  },
270
  "model.layers.7.self_attn.v_proj": {
271
- "group_size": 32,
272
- "bits": 4,
273
- "mode": "mxfp4"
274
  },
275
  "model.layers.7.self_attn.o_proj": {
276
- "group_size": 32,
277
- "bits": 4,
278
- "mode": "mxfp4"
279
  },
280
  "model.layers.7.mlp.switch_mlp.gate_proj": {
281
  "group_size": 32,
@@ -293,18 +293,18 @@
293
  "mode": "mxfp4"
294
  },
295
  "model.layers.8.linear_attn.in_proj_qkvz": {
296
- "group_size": 32,
297
- "bits": 4,
298
- "mode": "mxfp4"
299
  },
300
  "model.layers.8.linear_attn.in_proj_ba": {
301
  "group_size": 64,
302
- "bits": 4,
303
  "mode": "affine"
304
  },
305
  "model.layers.8.linear_attn.out_proj": {
306
  "group_size": 64,
307
- "bits": 5,
308
  "mode": "affine"
309
  },
310
  "model.layers.8.mlp.switch_mlp.gate_proj": {
@@ -323,18 +323,18 @@
323
  "mode": "mxfp4"
324
  },
325
  "model.layers.9.linear_attn.in_proj_qkvz": {
326
- "group_size": 32,
327
- "bits": 4,
328
- "mode": "mxfp4"
329
  },
330
  "model.layers.9.linear_attn.in_proj_ba": {
331
  "group_size": 64,
332
- "bits": 4,
333
  "mode": "affine"
334
  },
335
  "model.layers.9.linear_attn.out_proj": {
336
  "group_size": 64,
337
- "bits": 5,
338
  "mode": "affine"
339
  },
340
  "model.layers.9.mlp.switch_mlp.gate_proj": {
@@ -353,18 +353,18 @@
353
  "mode": "mxfp4"
354
  },
355
  "model.layers.10.linear_attn.in_proj_qkvz": {
356
- "group_size": 32,
357
- "bits": 4,
358
- "mode": "mxfp4"
359
  },
360
  "model.layers.10.linear_attn.in_proj_ba": {
361
  "group_size": 64,
362
- "bits": 4,
363
  "mode": "affine"
364
  },
365
  "model.layers.10.linear_attn.out_proj": {
366
  "group_size": 64,
367
- "bits": 5,
368
  "mode": "affine"
369
  },
370
  "model.layers.10.mlp.switch_mlp.gate_proj": {
@@ -378,29 +378,29 @@
378
  "mode": "mxfp4"
379
  },
380
  "model.layers.10.mlp.switch_mlp.down_proj": {
381
- "group_size": 64,
382
- "bits": 6,
383
- "mode": "affine"
384
- },
385
- "model.layers.11.self_attn.q_proj": {
386
  "group_size": 32,
387
  "bits": 4,
388
  "mode": "mxfp4"
389
  },
 
 
 
 
 
390
  "model.layers.11.self_attn.k_proj": {
391
- "group_size": 32,
392
- "bits": 4,
393
- "mode": "mxfp4"
394
  },
395
  "model.layers.11.self_attn.v_proj": {
396
- "group_size": 32,
397
- "bits": 4,
398
- "mode": "mxfp4"
399
  },
400
  "model.layers.11.self_attn.o_proj": {
401
- "group_size": 32,
402
- "bits": 4,
403
- "mode": "mxfp4"
404
  },
405
  "model.layers.11.mlp.switch_mlp.gate_proj": {
406
  "group_size": 32,
@@ -418,18 +418,18 @@
418
  "mode": "mxfp4"
419
  },
420
  "model.layers.12.linear_attn.in_proj_qkvz": {
421
- "group_size": 32,
422
- "bits": 4,
423
- "mode": "mxfp4"
424
  },
425
  "model.layers.12.linear_attn.in_proj_ba": {
426
  "group_size": 64,
427
- "bits": 4,
428
  "mode": "affine"
429
  },
430
  "model.layers.12.linear_attn.out_proj": {
431
  "group_size": 64,
432
- "bits": 6,
433
  "mode": "affine"
434
  },
435
  "model.layers.12.mlp.switch_mlp.gate_proj": {
@@ -448,18 +448,18 @@
448
  "mode": "mxfp4"
449
  },
450
  "model.layers.13.linear_attn.in_proj_qkvz": {
451
- "group_size": 32,
452
- "bits": 4,
453
- "mode": "mxfp4"
454
  },
455
  "model.layers.13.linear_attn.in_proj_ba": {
456
  "group_size": 64,
457
- "bits": 4,
458
  "mode": "affine"
459
  },
460
  "model.layers.13.linear_attn.out_proj": {
461
  "group_size": 64,
462
- "bits": 5,
463
  "mode": "affine"
464
  },
465
  "model.layers.13.mlp.switch_mlp.gate_proj": {
@@ -478,18 +478,18 @@
478
  "mode": "mxfp4"
479
  },
480
  "model.layers.14.linear_attn.in_proj_qkvz": {
481
- "group_size": 32,
482
- "bits": 4,
483
- "mode": "mxfp4"
484
  },
485
  "model.layers.14.linear_attn.in_proj_ba": {
486
  "group_size": 64,
487
- "bits": 4,
488
  "mode": "affine"
489
  },
490
  "model.layers.14.linear_attn.out_proj": {
491
  "group_size": 64,
492
- "bits": 5,
493
  "mode": "affine"
494
  },
495
  "model.layers.14.mlp.switch_mlp.gate_proj": {
@@ -503,29 +503,29 @@
503
  "mode": "mxfp4"
504
  },
505
  "model.layers.14.mlp.switch_mlp.down_proj": {
506
- "group_size": 64,
507
- "bits": 6,
508
- "mode": "affine"
509
- },
510
- "model.layers.15.self_attn.q_proj": {
511
  "group_size": 32,
512
  "bits": 4,
513
  "mode": "mxfp4"
514
  },
 
 
 
 
 
515
  "model.layers.15.self_attn.k_proj": {
516
- "group_size": 32,
517
- "bits": 4,
518
- "mode": "mxfp4"
519
  },
520
  "model.layers.15.self_attn.v_proj": {
521
- "group_size": 32,
522
- "bits": 4,
523
- "mode": "mxfp4"
524
  },
525
  "model.layers.15.self_attn.o_proj": {
526
- "group_size": 32,
527
- "bits": 4,
528
- "mode": "mxfp4"
529
  },
530
  "model.layers.15.mlp.switch_mlp.gate_proj": {
531
  "group_size": 32,
@@ -543,18 +543,18 @@
543
  "mode": "mxfp4"
544
  },
545
  "model.layers.16.linear_attn.in_proj_qkvz": {
546
- "group_size": 32,
547
- "bits": 4,
548
- "mode": "mxfp4"
549
  },
550
  "model.layers.16.linear_attn.in_proj_ba": {
551
  "group_size": 64,
552
- "bits": 4,
553
  "mode": "affine"
554
  },
555
  "model.layers.16.linear_attn.out_proj": {
556
  "group_size": 64,
557
- "bits": 6,
558
  "mode": "affine"
559
  },
560
  "model.layers.16.mlp.switch_mlp.gate_proj": {
@@ -573,18 +573,18 @@
573
  "mode": "mxfp4"
574
  },
575
  "model.layers.17.linear_attn.in_proj_qkvz": {
576
- "group_size": 32,
577
- "bits": 4,
578
- "mode": "mxfp4"
579
  },
580
  "model.layers.17.linear_attn.in_proj_ba": {
581
  "group_size": 64,
582
- "bits": 4,
583
  "mode": "affine"
584
  },
585
  "model.layers.17.linear_attn.out_proj": {
586
  "group_size": 64,
587
- "bits": 6,
588
  "mode": "affine"
589
  },
590
  "model.layers.17.mlp.switch_mlp.gate_proj": {
@@ -603,18 +603,18 @@
603
  "mode": "mxfp4"
604
  },
605
  "model.layers.18.linear_attn.in_proj_qkvz": {
606
- "group_size": 32,
607
- "bits": 4,
608
- "mode": "mxfp4"
609
  },
610
  "model.layers.18.linear_attn.in_proj_ba": {
611
  "group_size": 64,
612
- "bits": 4,
613
  "mode": "affine"
614
  },
615
  "model.layers.18.linear_attn.out_proj": {
616
  "group_size": 64,
617
- "bits": 5,
618
  "mode": "affine"
619
  },
620
  "model.layers.18.mlp.switch_mlp.gate_proj": {
@@ -633,24 +633,24 @@
633
  "mode": "mxfp4"
634
  },
635
  "model.layers.19.self_attn.q_proj": {
636
- "group_size": 32,
637
- "bits": 4,
638
- "mode": "mxfp4"
639
  },
640
  "model.layers.19.self_attn.k_proj": {
641
- "group_size": 32,
642
- "bits": 4,
643
- "mode": "mxfp4"
644
  },
645
  "model.layers.19.self_attn.v_proj": {
646
- "group_size": 32,
647
- "bits": 4,
648
- "mode": "mxfp4"
649
  },
650
  "model.layers.19.self_attn.o_proj": {
651
- "group_size": 32,
652
- "bits": 4,
653
- "mode": "mxfp4"
654
  },
655
  "model.layers.19.mlp.switch_mlp.gate_proj": {
656
  "group_size": 32,
@@ -668,18 +668,18 @@
668
  "mode": "mxfp4"
669
  },
670
  "model.layers.20.linear_attn.in_proj_qkvz": {
671
- "group_size": 32,
672
- "bits": 4,
673
- "mode": "mxfp4"
674
  },
675
  "model.layers.20.linear_attn.in_proj_ba": {
676
  "group_size": 64,
677
- "bits": 4,
678
  "mode": "affine"
679
  },
680
  "model.layers.20.linear_attn.out_proj": {
681
  "group_size": 64,
682
- "bits": 5,
683
  "mode": "affine"
684
  },
685
  "model.layers.20.mlp.switch_mlp.gate_proj": {
@@ -698,18 +698,18 @@
698
  "mode": "mxfp4"
699
  },
700
  "model.layers.21.linear_attn.in_proj_qkvz": {
701
- "group_size": 32,
702
- "bits": 4,
703
- "mode": "mxfp4"
704
  },
705
  "model.layers.21.linear_attn.in_proj_ba": {
706
  "group_size": 64,
707
- "bits": 4,
708
  "mode": "affine"
709
  },
710
  "model.layers.21.linear_attn.out_proj": {
711
  "group_size": 64,
712
- "bits": 5,
713
  "mode": "affine"
714
  },
715
  "model.layers.21.mlp.switch_mlp.gate_proj": {
@@ -728,18 +728,18 @@
728
  "mode": "mxfp4"
729
  },
730
  "model.layers.22.linear_attn.in_proj_qkvz": {
731
- "group_size": 32,
732
- "bits": 4,
733
- "mode": "mxfp4"
734
  },
735
  "model.layers.22.linear_attn.in_proj_ba": {
736
  "group_size": 64,
737
- "bits": 4,
738
  "mode": "affine"
739
  },
740
  "model.layers.22.linear_attn.out_proj": {
741
  "group_size": 64,
742
- "bits": 5,
743
  "mode": "affine"
744
  },
745
  "model.layers.22.mlp.switch_mlp.gate_proj": {
@@ -753,29 +753,29 @@
753
  "mode": "mxfp4"
754
  },
755
  "model.layers.22.mlp.switch_mlp.down_proj": {
756
- "group_size": 64,
757
- "bits": 6,
758
- "mode": "affine"
759
- },
760
- "model.layers.23.self_attn.q_proj": {
761
  "group_size": 32,
762
  "bits": 4,
763
  "mode": "mxfp4"
764
  },
 
 
 
 
 
765
  "model.layers.23.self_attn.k_proj": {
766
- "group_size": 32,
767
- "bits": 4,
768
- "mode": "mxfp4"
769
  },
770
  "model.layers.23.self_attn.v_proj": {
771
- "group_size": 32,
772
- "bits": 4,
773
- "mode": "mxfp4"
774
  },
775
  "model.layers.23.self_attn.o_proj": {
776
- "group_size": 32,
777
- "bits": 4,
778
- "mode": "mxfp4"
779
  },
780
  "model.layers.23.mlp.switch_mlp.gate_proj": {
781
  "group_size": 32,
@@ -793,18 +793,18 @@
793
  "mode": "mxfp4"
794
  },
795
  "model.layers.24.linear_attn.in_proj_qkvz": {
796
- "group_size": 32,
797
- "bits": 4,
798
- "mode": "mxfp4"
799
  },
800
  "model.layers.24.linear_attn.in_proj_ba": {
801
  "group_size": 64,
802
- "bits": 4,
803
  "mode": "affine"
804
  },
805
  "model.layers.24.linear_attn.out_proj": {
806
  "group_size": 64,
807
- "bits": 5,
808
  "mode": "affine"
809
  },
810
  "model.layers.24.mlp.switch_mlp.gate_proj": {
@@ -823,18 +823,18 @@
823
  "mode": "mxfp4"
824
  },
825
  "model.layers.25.linear_attn.in_proj_qkvz": {
826
- "group_size": 32,
827
- "bits": 4,
828
- "mode": "mxfp4"
829
  },
830
  "model.layers.25.linear_attn.in_proj_ba": {
831
  "group_size": 64,
832
- "bits": 4,
833
  "mode": "affine"
834
  },
835
  "model.layers.25.linear_attn.out_proj": {
836
  "group_size": 64,
837
- "bits": 5,
838
  "mode": "affine"
839
  },
840
  "model.layers.25.mlp.switch_mlp.gate_proj": {
@@ -853,18 +853,18 @@
853
  "mode": "mxfp4"
854
  },
855
  "model.layers.26.linear_attn.in_proj_qkvz": {
856
- "group_size": 32,
857
- "bits": 4,
858
- "mode": "mxfp4"
859
  },
860
  "model.layers.26.linear_attn.in_proj_ba": {
861
  "group_size": 64,
862
- "bits": 4,
863
  "mode": "affine"
864
  },
865
  "model.layers.26.linear_attn.out_proj": {
866
  "group_size": 64,
867
- "bits": 5,
868
  "mode": "affine"
869
  },
870
  "model.layers.26.mlp.switch_mlp.gate_proj": {
@@ -883,24 +883,24 @@
883
  "mode": "mxfp4"
884
  },
885
  "model.layers.27.self_attn.q_proj": {
886
- "group_size": 32,
887
- "bits": 4,
888
- "mode": "mxfp4"
889
  },
890
  "model.layers.27.self_attn.k_proj": {
891
- "group_size": 32,
892
- "bits": 4,
893
- "mode": "mxfp4"
894
  },
895
  "model.layers.27.self_attn.v_proj": {
896
- "group_size": 32,
897
- "bits": 4,
898
- "mode": "mxfp4"
899
  },
900
  "model.layers.27.self_attn.o_proj": {
901
- "group_size": 32,
902
- "bits": 4,
903
- "mode": "mxfp4"
904
  },
905
  "model.layers.27.mlp.switch_mlp.gate_proj": {
906
  "group_size": 32,
@@ -918,18 +918,18 @@
918
  "mode": "mxfp4"
919
  },
920
  "model.layers.28.linear_attn.in_proj_qkvz": {
921
- "group_size": 32,
922
- "bits": 4,
923
- "mode": "mxfp4"
924
  },
925
  "model.layers.28.linear_attn.in_proj_ba": {
926
  "group_size": 64,
927
- "bits": 4,
928
  "mode": "affine"
929
  },
930
  "model.layers.28.linear_attn.out_proj": {
931
  "group_size": 64,
932
- "bits": 5,
933
  "mode": "affine"
934
  },
935
  "model.layers.28.mlp.switch_mlp.gate_proj": {
@@ -948,18 +948,18 @@
948
  "mode": "mxfp4"
949
  },
950
  "model.layers.29.linear_attn.in_proj_qkvz": {
951
- "group_size": 32,
952
- "bits": 4,
953
- "mode": "mxfp4"
954
  },
955
  "model.layers.29.linear_attn.in_proj_ba": {
956
  "group_size": 64,
957
- "bits": 4,
958
  "mode": "affine"
959
  },
960
  "model.layers.29.linear_attn.out_proj": {
961
  "group_size": 64,
962
- "bits": 5,
963
  "mode": "affine"
964
  },
965
  "model.layers.29.mlp.switch_mlp.gate_proj": {
@@ -978,18 +978,18 @@
978
  "mode": "mxfp4"
979
  },
980
  "model.layers.30.linear_attn.in_proj_qkvz": {
981
- "group_size": 32,
982
- "bits": 4,
983
- "mode": "mxfp4"
984
  },
985
  "model.layers.30.linear_attn.in_proj_ba": {
986
  "group_size": 64,
987
- "bits": 4,
988
  "mode": "affine"
989
  },
990
  "model.layers.30.linear_attn.out_proj": {
991
  "group_size": 64,
992
- "bits": 6,
993
  "mode": "affine"
994
  },
995
  "model.layers.30.mlp.switch_mlp.gate_proj": {
@@ -1008,24 +1008,24 @@
1008
  "mode": "mxfp4"
1009
  },
1010
  "model.layers.31.self_attn.q_proj": {
1011
- "group_size": 32,
1012
- "bits": 4,
1013
- "mode": "mxfp4"
1014
  },
1015
  "model.layers.31.self_attn.k_proj": {
1016
- "group_size": 32,
1017
- "bits": 4,
1018
- "mode": "mxfp4"
1019
  },
1020
  "model.layers.31.self_attn.v_proj": {
1021
- "group_size": 32,
1022
- "bits": 4,
1023
- "mode": "mxfp4"
1024
  },
1025
  "model.layers.31.self_attn.o_proj": {
1026
- "group_size": 32,
1027
- "bits": 4,
1028
- "mode": "mxfp4"
1029
  },
1030
  "model.layers.31.mlp.switch_mlp.gate_proj": {
1031
  "group_size": 32,
@@ -1038,23 +1038,23 @@
1038
  "mode": "mxfp4"
1039
  },
1040
  "model.layers.31.mlp.switch_mlp.down_proj": {
1041
- "group_size": 64,
1042
- "bits": 6,
1043
- "mode": "affine"
1044
- },
1045
- "model.layers.32.linear_attn.in_proj_qkvz": {
1046
  "group_size": 32,
1047
  "bits": 4,
1048
  "mode": "mxfp4"
1049
  },
 
 
 
 
 
1050
  "model.layers.32.linear_attn.in_proj_ba": {
1051
  "group_size": 64,
1052
- "bits": 4,
1053
  "mode": "affine"
1054
  },
1055
  "model.layers.32.linear_attn.out_proj": {
1056
  "group_size": 64,
1057
- "bits": 6,
1058
  "mode": "affine"
1059
  },
1060
  "model.layers.32.mlp.switch_mlp.gate_proj": {
@@ -1073,18 +1073,18 @@
1073
  "mode": "mxfp4"
1074
  },
1075
  "model.layers.33.linear_attn.in_proj_qkvz": {
1076
- "group_size": 32,
1077
- "bits": 4,
1078
- "mode": "mxfp4"
1079
  },
1080
  "model.layers.33.linear_attn.in_proj_ba": {
1081
  "group_size": 64,
1082
- "bits": 4,
1083
  "mode": "affine"
1084
  },
1085
  "model.layers.33.linear_attn.out_proj": {
1086
  "group_size": 64,
1087
- "bits": 5,
1088
  "mode": "affine"
1089
  },
1090
  "model.layers.33.mlp.switch_mlp.gate_proj": {
@@ -1103,18 +1103,18 @@
1103
  "mode": "mxfp4"
1104
  },
1105
  "model.layers.34.linear_attn.in_proj_qkvz": {
1106
- "group_size": 32,
1107
- "bits": 4,
1108
- "mode": "mxfp4"
1109
  },
1110
  "model.layers.34.linear_attn.in_proj_ba": {
1111
  "group_size": 64,
1112
- "bits": 4,
1113
  "mode": "affine"
1114
  },
1115
  "model.layers.34.linear_attn.out_proj": {
1116
  "group_size": 64,
1117
- "bits": 6,
1118
  "mode": "affine"
1119
  },
1120
  "model.layers.34.mlp.switch_mlp.gate_proj": {
@@ -1133,24 +1133,24 @@
1133
  "mode": "mxfp4"
1134
  },
1135
  "model.layers.35.self_attn.q_proj": {
1136
- "group_size": 32,
1137
- "bits": 4,
1138
- "mode": "mxfp4"
1139
  },
1140
  "model.layers.35.self_attn.k_proj": {
1141
- "group_size": 32,
1142
- "bits": 4,
1143
- "mode": "mxfp4"
1144
  },
1145
  "model.layers.35.self_attn.v_proj": {
1146
- "group_size": 32,
1147
- "bits": 4,
1148
- "mode": "mxfp4"
1149
  },
1150
  "model.layers.35.self_attn.o_proj": {
1151
- "group_size": 32,
1152
- "bits": 4,
1153
- "mode": "mxfp4"
1154
  },
1155
  "model.layers.35.mlp.switch_mlp.gate_proj": {
1156
  "group_size": 32,
@@ -1168,18 +1168,18 @@
1168
  "mode": "mxfp4"
1169
  },
1170
  "model.layers.36.linear_attn.in_proj_qkvz": {
1171
- "group_size": 32,
1172
- "bits": 4,
1173
- "mode": "mxfp4"
1174
  },
1175
  "model.layers.36.linear_attn.in_proj_ba": {
1176
  "group_size": 64,
1177
- "bits": 4,
1178
  "mode": "affine"
1179
  },
1180
  "model.layers.36.linear_attn.out_proj": {
1181
  "group_size": 64,
1182
- "bits": 5,
1183
  "mode": "affine"
1184
  },
1185
  "model.layers.36.mlp.switch_mlp.gate_proj": {
@@ -1198,18 +1198,18 @@
1198
  "mode": "mxfp4"
1199
  },
1200
  "model.layers.37.linear_attn.in_proj_qkvz": {
1201
- "group_size": 32,
1202
- "bits": 4,
1203
- "mode": "mxfp4"
1204
  },
1205
  "model.layers.37.linear_attn.in_proj_ba": {
1206
  "group_size": 64,
1207
- "bits": 4,
1208
  "mode": "affine"
1209
  },
1210
  "model.layers.37.linear_attn.out_proj": {
1211
  "group_size": 64,
1212
- "bits": 5,
1213
  "mode": "affine"
1214
  },
1215
  "model.layers.37.mlp.switch_mlp.gate_proj": {
@@ -1228,18 +1228,18 @@
1228
  "mode": "mxfp4"
1229
  },
1230
  "model.layers.38.linear_attn.in_proj_qkvz": {
1231
- "group_size": 32,
1232
- "bits": 4,
1233
- "mode": "mxfp4"
1234
  },
1235
  "model.layers.38.linear_attn.in_proj_ba": {
1236
  "group_size": 64,
1237
- "bits": 4,
1238
  "mode": "affine"
1239
  },
1240
  "model.layers.38.linear_attn.out_proj": {
1241
  "group_size": 64,
1242
- "bits": 5,
1243
  "mode": "affine"
1244
  },
1245
  "model.layers.38.mlp.switch_mlp.gate_proj": {
@@ -1258,24 +1258,24 @@
1258
  "mode": "mxfp4"
1259
  },
1260
  "model.layers.39.self_attn.q_proj": {
1261
- "group_size": 32,
1262
- "bits": 4,
1263
- "mode": "mxfp4"
1264
  },
1265
  "model.layers.39.self_attn.k_proj": {
1266
- "group_size": 32,
1267
- "bits": 4,
1268
- "mode": "mxfp4"
1269
  },
1270
  "model.layers.39.self_attn.v_proj": {
1271
- "group_size": 32,
1272
- "bits": 4,
1273
- "mode": "mxfp4"
1274
  },
1275
  "model.layers.39.self_attn.o_proj": {
1276
- "group_size": 32,
1277
- "bits": 4,
1278
- "mode": "mxfp4"
1279
  },
1280
  "model.layers.39.mlp.switch_mlp.gate_proj": {
1281
  "group_size": 32,
@@ -1288,23 +1288,23 @@
1288
  "mode": "mxfp4"
1289
  },
1290
  "model.layers.39.mlp.switch_mlp.down_proj": {
1291
- "group_size": 64,
1292
- "bits": 6,
1293
- "mode": "affine"
1294
- },
1295
- "model.layers.40.linear_attn.in_proj_qkvz": {
1296
  "group_size": 32,
1297
  "bits": 4,
1298
  "mode": "mxfp4"
1299
  },
 
 
 
 
 
1300
  "model.layers.40.linear_attn.in_proj_ba": {
1301
  "group_size": 64,
1302
- "bits": 4,
1303
  "mode": "affine"
1304
  },
1305
  "model.layers.40.linear_attn.out_proj": {
1306
  "group_size": 64,
1307
- "bits": 5,
1308
  "mode": "affine"
1309
  },
1310
  "model.layers.40.mlp.switch_mlp.gate_proj": {
@@ -1323,18 +1323,18 @@
1323
  "mode": "mxfp4"
1324
  },
1325
  "model.layers.41.linear_attn.in_proj_qkvz": {
1326
- "group_size": 32,
1327
- "bits": 4,
1328
- "mode": "mxfp4"
1329
  },
1330
  "model.layers.41.linear_attn.in_proj_ba": {
1331
  "group_size": 64,
1332
- "bits": 4,
1333
  "mode": "affine"
1334
  },
1335
  "model.layers.41.linear_attn.out_proj": {
1336
  "group_size": 64,
1337
- "bits": 5,
1338
  "mode": "affine"
1339
  },
1340
  "model.layers.41.mlp.switch_mlp.gate_proj": {
@@ -1353,18 +1353,18 @@
1353
  "mode": "mxfp4"
1354
  },
1355
  "model.layers.42.linear_attn.in_proj_qkvz": {
1356
- "group_size": 32,
1357
- "bits": 4,
1358
- "mode": "mxfp4"
1359
  },
1360
  "model.layers.42.linear_attn.in_proj_ba": {
1361
  "group_size": 64,
1362
- "bits": 4,
1363
  "mode": "affine"
1364
  },
1365
  "model.layers.42.linear_attn.out_proj": {
1366
  "group_size": 64,
1367
- "bits": 5,
1368
  "mode": "affine"
1369
  },
1370
  "model.layers.42.mlp.switch_mlp.gate_proj": {
@@ -1378,28 +1378,28 @@
1378
  "mode": "mxfp4"
1379
  },
1380
  "model.layers.42.mlp.switch_mlp.down_proj": {
1381
- "group_size": 64,
1382
- "bits": 6,
1383
- "mode": "affine"
1384
  },
1385
  "model.layers.43.self_attn.q_proj": {
1386
  "group_size": 64,
1387
- "bits": 5,
1388
  "mode": "affine"
1389
  },
1390
  "model.layers.43.self_attn.k_proj": {
1391
  "group_size": 64,
1392
- "bits": 5,
1393
  "mode": "affine"
1394
  },
1395
  "model.layers.43.self_attn.v_proj": {
1396
  "group_size": 64,
1397
- "bits": 5,
1398
  "mode": "affine"
1399
  },
1400
  "model.layers.43.self_attn.o_proj": {
1401
  "group_size": 64,
1402
- "bits": 5,
1403
  "mode": "affine"
1404
  },
1405
  "model.layers.43.mlp.switch_mlp.gate_proj": {
@@ -1413,23 +1413,23 @@
1413
  "mode": "mxfp4"
1414
  },
1415
  "model.layers.43.mlp.switch_mlp.down_proj": {
1416
- "group_size": 64,
1417
- "bits": 6,
1418
- "mode": "affine"
1419
- },
1420
- "model.layers.44.linear_attn.in_proj_qkvz": {
1421
  "group_size": 32,
1422
  "bits": 4,
1423
  "mode": "mxfp4"
1424
  },
 
 
 
 
 
1425
  "model.layers.44.linear_attn.in_proj_ba": {
1426
  "group_size": 64,
1427
- "bits": 4,
1428
  "mode": "affine"
1429
  },
1430
  "model.layers.44.linear_attn.out_proj": {
1431
  "group_size": 64,
1432
- "bits": 6,
1433
  "mode": "affine"
1434
  },
1435
  "model.layers.44.mlp.switch_mlp.gate_proj": {
@@ -1449,12 +1449,12 @@
1449
  },
1450
  "model.layers.45.linear_attn.in_proj_qkvz": {
1451
  "group_size": 64,
1452
- "bits": 5,
1453
  "mode": "affine"
1454
  },
1455
  "model.layers.45.linear_attn.in_proj_ba": {
1456
  "group_size": 64,
1457
- "bits": 4,
1458
  "mode": "affine"
1459
  },
1460
  "model.layers.45.linear_attn.out_proj": {
@@ -1473,18 +1473,18 @@
1473
  "mode": "mxfp4"
1474
  },
1475
  "model.layers.45.mlp.switch_mlp.down_proj": {
1476
- "group_size": 64,
1477
- "bits": 6,
1478
- "mode": "affine"
1479
  },
1480
  "model.layers.46.linear_attn.in_proj_qkvz": {
1481
  "group_size": 64,
1482
- "bits": 5,
1483
  "mode": "affine"
1484
  },
1485
  "model.layers.46.linear_attn.in_proj_ba": {
1486
  "group_size": 64,
1487
- "bits": 4,
1488
  "mode": "affine"
1489
  },
1490
  "model.layers.46.linear_attn.out_proj": {
@@ -1503,28 +1503,28 @@
1503
  "mode": "mxfp4"
1504
  },
1505
  "model.layers.46.mlp.switch_mlp.down_proj": {
1506
- "group_size": 64,
1507
- "bits": 6,
1508
- "mode": "affine"
1509
  },
1510
  "model.layers.47.self_attn.q_proj": {
1511
  "group_size": 64,
1512
- "bits": 5,
1513
  "mode": "affine"
1514
  },
1515
  "model.layers.47.self_attn.k_proj": {
1516
  "group_size": 64,
1517
- "bits": 5,
1518
  "mode": "affine"
1519
  },
1520
  "model.layers.47.self_attn.v_proj": {
1521
  "group_size": 64,
1522
- "bits": 5,
1523
  "mode": "affine"
1524
  },
1525
  "model.layers.47.self_attn.o_proj": {
1526
  "group_size": 64,
1527
- "bits": 5,
1528
  "mode": "affine"
1529
  },
1530
  "model.layers.47.mlp.switch_mlp.gate_proj": {
@@ -1538,13 +1538,13 @@
1538
  "mode": "mxfp4"
1539
  },
1540
  "model.layers.47.mlp.switch_mlp.down_proj": {
1541
- "group_size": 64,
1542
- "bits": 6,
1543
- "mode": "affine"
1544
  },
1545
  "lm_head": {
1546
  "group_size": 64,
1547
- "bits": 6,
1548
  "mode": "affine"
1549
  }
1550
  },
@@ -1554,22 +1554,22 @@
1554
  "mode": "affine",
1555
  "model.embed_tokens": {
1556
  "group_size": 64,
1557
- "bits": 4,
1558
  "mode": "affine"
1559
  },
1560
  "model.layers.0.linear_attn.in_proj_qkvz": {
1561
- "group_size": 32,
1562
- "bits": 4,
1563
- "mode": "mxfp4"
1564
  },
1565
  "model.layers.0.linear_attn.in_proj_ba": {
1566
  "group_size": 64,
1567
- "bits": 4,
1568
  "mode": "affine"
1569
  },
1570
  "model.layers.0.linear_attn.out_proj": {
1571
  "group_size": 64,
1572
- "bits": 5,
1573
  "mode": "affine"
1574
  },
1575
  "model.layers.0.mlp.switch_mlp.gate_proj": {
@@ -1588,18 +1588,18 @@
1588
  "mode": "mxfp4"
1589
  },
1590
  "model.layers.1.linear_attn.in_proj_qkvz": {
1591
- "group_size": 32,
1592
- "bits": 4,
1593
- "mode": "mxfp4"
1594
  },
1595
  "model.layers.1.linear_attn.in_proj_ba": {
1596
  "group_size": 64,
1597
- "bits": 4,
1598
  "mode": "affine"
1599
  },
1600
  "model.layers.1.linear_attn.out_proj": {
1601
  "group_size": 64,
1602
- "bits": 5,
1603
  "mode": "affine"
1604
  },
1605
  "model.layers.1.mlp.switch_mlp.gate_proj": {
@@ -1613,23 +1613,23 @@
1613
  "mode": "mxfp4"
1614
  },
1615
  "model.layers.1.mlp.switch_mlp.down_proj": {
1616
- "group_size": 64,
1617
- "bits": 6,
1618
- "mode": "affine"
1619
- },
1620
- "model.layers.2.linear_attn.in_proj_qkvz": {
1621
  "group_size": 32,
1622
  "bits": 4,
1623
  "mode": "mxfp4"
1624
  },
 
 
 
 
 
1625
  "model.layers.2.linear_attn.in_proj_ba": {
1626
  "group_size": 64,
1627
- "bits": 4,
1628
  "mode": "affine"
1629
  },
1630
  "model.layers.2.linear_attn.out_proj": {
1631
  "group_size": 64,
1632
- "bits": 5,
1633
  "mode": "affine"
1634
  },
1635
  "model.layers.2.mlp.switch_mlp.gate_proj": {
@@ -1648,24 +1648,24 @@
1648
  "mode": "mxfp4"
1649
  },
1650
  "model.layers.3.self_attn.q_proj": {
1651
- "group_size": 32,
1652
- "bits": 4,
1653
- "mode": "mxfp4"
1654
  },
1655
  "model.layers.3.self_attn.k_proj": {
1656
- "group_size": 32,
1657
- "bits": 4,
1658
- "mode": "mxfp4"
1659
  },
1660
  "model.layers.3.self_attn.v_proj": {
1661
- "group_size": 32,
1662
- "bits": 4,
1663
- "mode": "mxfp4"
1664
  },
1665
  "model.layers.3.self_attn.o_proj": {
1666
- "group_size": 32,
1667
- "bits": 4,
1668
- "mode": "mxfp4"
1669
  },
1670
  "model.layers.3.mlp.switch_mlp.gate_proj": {
1671
  "group_size": 32,
@@ -1683,18 +1683,18 @@
1683
  "mode": "mxfp4"
1684
  },
1685
  "model.layers.4.linear_attn.in_proj_qkvz": {
1686
- "group_size": 32,
1687
- "bits": 4,
1688
- "mode": "mxfp4"
1689
  },
1690
  "model.layers.4.linear_attn.in_proj_ba": {
1691
  "group_size": 64,
1692
- "bits": 4,
1693
  "mode": "affine"
1694
  },
1695
  "model.layers.4.linear_attn.out_proj": {
1696
  "group_size": 64,
1697
- "bits": 5,
1698
  "mode": "affine"
1699
  },
1700
  "model.layers.4.mlp.switch_mlp.gate_proj": {
@@ -1713,18 +1713,18 @@
1713
  "mode": "mxfp4"
1714
  },
1715
  "model.layers.5.linear_attn.in_proj_qkvz": {
1716
- "group_size": 32,
1717
- "bits": 4,
1718
- "mode": "mxfp4"
1719
  },
1720
  "model.layers.5.linear_attn.in_proj_ba": {
1721
  "group_size": 64,
1722
- "bits": 4,
1723
  "mode": "affine"
1724
  },
1725
  "model.layers.5.linear_attn.out_proj": {
1726
  "group_size": 64,
1727
- "bits": 5,
1728
  "mode": "affine"
1729
  },
1730
  "model.layers.5.mlp.switch_mlp.gate_proj": {
@@ -1743,18 +1743,18 @@
1743
  "mode": "mxfp4"
1744
  },
1745
  "model.layers.6.linear_attn.in_proj_qkvz": {
1746
- "group_size": 32,
1747
- "bits": 4,
1748
- "mode": "mxfp4"
1749
  },
1750
  "model.layers.6.linear_attn.in_proj_ba": {
1751
  "group_size": 64,
1752
- "bits": 4,
1753
  "mode": "affine"
1754
  },
1755
  "model.layers.6.linear_attn.out_proj": {
1756
  "group_size": 64,
1757
- "bits": 5,
1758
  "mode": "affine"
1759
  },
1760
  "model.layers.6.mlp.switch_mlp.gate_proj": {
@@ -1768,29 +1768,29 @@
1768
  "mode": "mxfp4"
1769
  },
1770
  "model.layers.6.mlp.switch_mlp.down_proj": {
1771
- "group_size": 64,
1772
- "bits": 6,
1773
- "mode": "affine"
1774
- },
1775
- "model.layers.7.self_attn.q_proj": {
1776
  "group_size": 32,
1777
  "bits": 4,
1778
  "mode": "mxfp4"
1779
  },
 
 
 
 
 
1780
  "model.layers.7.self_attn.k_proj": {
1781
- "group_size": 32,
1782
- "bits": 4,
1783
- "mode": "mxfp4"
1784
  },
1785
  "model.layers.7.self_attn.v_proj": {
1786
- "group_size": 32,
1787
- "bits": 4,
1788
- "mode": "mxfp4"
1789
  },
1790
  "model.layers.7.self_attn.o_proj": {
1791
- "group_size": 32,
1792
- "bits": 4,
1793
- "mode": "mxfp4"
1794
  },
1795
  "model.layers.7.mlp.switch_mlp.gate_proj": {
1796
  "group_size": 32,
@@ -1808,18 +1808,18 @@
1808
  "mode": "mxfp4"
1809
  },
1810
  "model.layers.8.linear_attn.in_proj_qkvz": {
1811
- "group_size": 32,
1812
- "bits": 4,
1813
- "mode": "mxfp4"
1814
  },
1815
  "model.layers.8.linear_attn.in_proj_ba": {
1816
  "group_size": 64,
1817
- "bits": 4,
1818
  "mode": "affine"
1819
  },
1820
  "model.layers.8.linear_attn.out_proj": {
1821
  "group_size": 64,
1822
- "bits": 5,
1823
  "mode": "affine"
1824
  },
1825
  "model.layers.8.mlp.switch_mlp.gate_proj": {
@@ -1838,18 +1838,18 @@
1838
  "mode": "mxfp4"
1839
  },
1840
  "model.layers.9.linear_attn.in_proj_qkvz": {
1841
- "group_size": 32,
1842
- "bits": 4,
1843
- "mode": "mxfp4"
1844
  },
1845
  "model.layers.9.linear_attn.in_proj_ba": {
1846
  "group_size": 64,
1847
- "bits": 4,
1848
  "mode": "affine"
1849
  },
1850
  "model.layers.9.linear_attn.out_proj": {
1851
  "group_size": 64,
1852
- "bits": 5,
1853
  "mode": "affine"
1854
  },
1855
  "model.layers.9.mlp.switch_mlp.gate_proj": {
@@ -1868,18 +1868,18 @@
1868
  "mode": "mxfp4"
1869
  },
1870
  "model.layers.10.linear_attn.in_proj_qkvz": {
1871
- "group_size": 32,
1872
- "bits": 4,
1873
- "mode": "mxfp4"
1874
  },
1875
  "model.layers.10.linear_attn.in_proj_ba": {
1876
  "group_size": 64,
1877
- "bits": 4,
1878
  "mode": "affine"
1879
  },
1880
  "model.layers.10.linear_attn.out_proj": {
1881
  "group_size": 64,
1882
- "bits": 5,
1883
  "mode": "affine"
1884
  },
1885
  "model.layers.10.mlp.switch_mlp.gate_proj": {
@@ -1893,29 +1893,29 @@
1893
  "mode": "mxfp4"
1894
  },
1895
  "model.layers.10.mlp.switch_mlp.down_proj": {
1896
- "group_size": 64,
1897
- "bits": 6,
1898
- "mode": "affine"
1899
- },
1900
- "model.layers.11.self_attn.q_proj": {
1901
  "group_size": 32,
1902
  "bits": 4,
1903
  "mode": "mxfp4"
1904
  },
 
 
 
 
 
1905
  "model.layers.11.self_attn.k_proj": {
1906
- "group_size": 32,
1907
- "bits": 4,
1908
- "mode": "mxfp4"
1909
  },
1910
  "model.layers.11.self_attn.v_proj": {
1911
- "group_size": 32,
1912
- "bits": 4,
1913
- "mode": "mxfp4"
1914
  },
1915
  "model.layers.11.self_attn.o_proj": {
1916
- "group_size": 32,
1917
- "bits": 4,
1918
- "mode": "mxfp4"
1919
  },
1920
  "model.layers.11.mlp.switch_mlp.gate_proj": {
1921
  "group_size": 32,
@@ -1933,18 +1933,18 @@
1933
  "mode": "mxfp4"
1934
  },
1935
  "model.layers.12.linear_attn.in_proj_qkvz": {
1936
- "group_size": 32,
1937
- "bits": 4,
1938
- "mode": "mxfp4"
1939
  },
1940
  "model.layers.12.linear_attn.in_proj_ba": {
1941
  "group_size": 64,
1942
- "bits": 4,
1943
  "mode": "affine"
1944
  },
1945
  "model.layers.12.linear_attn.out_proj": {
1946
  "group_size": 64,
1947
- "bits": 6,
1948
  "mode": "affine"
1949
  },
1950
  "model.layers.12.mlp.switch_mlp.gate_proj": {
@@ -1963,18 +1963,18 @@
1963
  "mode": "mxfp4"
1964
  },
1965
  "model.layers.13.linear_attn.in_proj_qkvz": {
1966
- "group_size": 32,
1967
- "bits": 4,
1968
- "mode": "mxfp4"
1969
  },
1970
  "model.layers.13.linear_attn.in_proj_ba": {
1971
  "group_size": 64,
1972
- "bits": 4,
1973
  "mode": "affine"
1974
  },
1975
  "model.layers.13.linear_attn.out_proj": {
1976
  "group_size": 64,
1977
- "bits": 5,
1978
  "mode": "affine"
1979
  },
1980
  "model.layers.13.mlp.switch_mlp.gate_proj": {
@@ -1993,18 +1993,18 @@
1993
  "mode": "mxfp4"
1994
  },
1995
  "model.layers.14.linear_attn.in_proj_qkvz": {
1996
- "group_size": 32,
1997
- "bits": 4,
1998
- "mode": "mxfp4"
1999
  },
2000
  "model.layers.14.linear_attn.in_proj_ba": {
2001
  "group_size": 64,
2002
- "bits": 4,
2003
  "mode": "affine"
2004
  },
2005
  "model.layers.14.linear_attn.out_proj": {
2006
  "group_size": 64,
2007
- "bits": 5,
2008
  "mode": "affine"
2009
  },
2010
  "model.layers.14.mlp.switch_mlp.gate_proj": {
@@ -2018,29 +2018,29 @@
2018
  "mode": "mxfp4"
2019
  },
2020
  "model.layers.14.mlp.switch_mlp.down_proj": {
2021
- "group_size": 64,
2022
- "bits": 6,
2023
- "mode": "affine"
2024
- },
2025
- "model.layers.15.self_attn.q_proj": {
2026
  "group_size": 32,
2027
  "bits": 4,
2028
  "mode": "mxfp4"
2029
  },
 
 
 
 
 
2030
  "model.layers.15.self_attn.k_proj": {
2031
- "group_size": 32,
2032
- "bits": 4,
2033
- "mode": "mxfp4"
2034
  },
2035
  "model.layers.15.self_attn.v_proj": {
2036
- "group_size": 32,
2037
- "bits": 4,
2038
- "mode": "mxfp4"
2039
  },
2040
  "model.layers.15.self_attn.o_proj": {
2041
- "group_size": 32,
2042
- "bits": 4,
2043
- "mode": "mxfp4"
2044
  },
2045
  "model.layers.15.mlp.switch_mlp.gate_proj": {
2046
  "group_size": 32,
@@ -2058,18 +2058,18 @@
2058
  "mode": "mxfp4"
2059
  },
2060
  "model.layers.16.linear_attn.in_proj_qkvz": {
2061
- "group_size": 32,
2062
- "bits": 4,
2063
- "mode": "mxfp4"
2064
  },
2065
  "model.layers.16.linear_attn.in_proj_ba": {
2066
  "group_size": 64,
2067
- "bits": 4,
2068
  "mode": "affine"
2069
  },
2070
  "model.layers.16.linear_attn.out_proj": {
2071
  "group_size": 64,
2072
- "bits": 6,
2073
  "mode": "affine"
2074
  },
2075
  "model.layers.16.mlp.switch_mlp.gate_proj": {
@@ -2088,18 +2088,18 @@
2088
  "mode": "mxfp4"
2089
  },
2090
  "model.layers.17.linear_attn.in_proj_qkvz": {
2091
- "group_size": 32,
2092
- "bits": 4,
2093
- "mode": "mxfp4"
2094
  },
2095
  "model.layers.17.linear_attn.in_proj_ba": {
2096
  "group_size": 64,
2097
- "bits": 4,
2098
  "mode": "affine"
2099
  },
2100
  "model.layers.17.linear_attn.out_proj": {
2101
  "group_size": 64,
2102
- "bits": 6,
2103
  "mode": "affine"
2104
  },
2105
  "model.layers.17.mlp.switch_mlp.gate_proj": {
@@ -2118,18 +2118,18 @@
2118
  "mode": "mxfp4"
2119
  },
2120
  "model.layers.18.linear_attn.in_proj_qkvz": {
2121
- "group_size": 32,
2122
- "bits": 4,
2123
- "mode": "mxfp4"
2124
  },
2125
  "model.layers.18.linear_attn.in_proj_ba": {
2126
  "group_size": 64,
2127
- "bits": 4,
2128
  "mode": "affine"
2129
  },
2130
  "model.layers.18.linear_attn.out_proj": {
2131
  "group_size": 64,
2132
- "bits": 5,
2133
  "mode": "affine"
2134
  },
2135
  "model.layers.18.mlp.switch_mlp.gate_proj": {
@@ -2148,24 +2148,24 @@
2148
  "mode": "mxfp4"
2149
  },
2150
  "model.layers.19.self_attn.q_proj": {
2151
- "group_size": 32,
2152
- "bits": 4,
2153
- "mode": "mxfp4"
2154
  },
2155
  "model.layers.19.self_attn.k_proj": {
2156
- "group_size": 32,
2157
- "bits": 4,
2158
- "mode": "mxfp4"
2159
  },
2160
  "model.layers.19.self_attn.v_proj": {
2161
- "group_size": 32,
2162
- "bits": 4,
2163
- "mode": "mxfp4"
2164
  },
2165
  "model.layers.19.self_attn.o_proj": {
2166
- "group_size": 32,
2167
- "bits": 4,
2168
- "mode": "mxfp4"
2169
  },
2170
  "model.layers.19.mlp.switch_mlp.gate_proj": {
2171
  "group_size": 32,
@@ -2183,18 +2183,18 @@
2183
  "mode": "mxfp4"
2184
  },
2185
  "model.layers.20.linear_attn.in_proj_qkvz": {
2186
- "group_size": 32,
2187
- "bits": 4,
2188
- "mode": "mxfp4"
2189
  },
2190
  "model.layers.20.linear_attn.in_proj_ba": {
2191
  "group_size": 64,
2192
- "bits": 4,
2193
  "mode": "affine"
2194
  },
2195
  "model.layers.20.linear_attn.out_proj": {
2196
  "group_size": 64,
2197
- "bits": 5,
2198
  "mode": "affine"
2199
  },
2200
  "model.layers.20.mlp.switch_mlp.gate_proj": {
@@ -2213,18 +2213,18 @@
2213
  "mode": "mxfp4"
2214
  },
2215
  "model.layers.21.linear_attn.in_proj_qkvz": {
2216
- "group_size": 32,
2217
- "bits": 4,
2218
- "mode": "mxfp4"
2219
  },
2220
  "model.layers.21.linear_attn.in_proj_ba": {
2221
  "group_size": 64,
2222
- "bits": 4,
2223
  "mode": "affine"
2224
  },
2225
  "model.layers.21.linear_attn.out_proj": {
2226
  "group_size": 64,
2227
- "bits": 5,
2228
  "mode": "affine"
2229
  },
2230
  "model.layers.21.mlp.switch_mlp.gate_proj": {
@@ -2243,18 +2243,18 @@
2243
  "mode": "mxfp4"
2244
  },
2245
  "model.layers.22.linear_attn.in_proj_qkvz": {
2246
- "group_size": 32,
2247
- "bits": 4,
2248
- "mode": "mxfp4"
2249
  },
2250
  "model.layers.22.linear_attn.in_proj_ba": {
2251
  "group_size": 64,
2252
- "bits": 4,
2253
  "mode": "affine"
2254
  },
2255
  "model.layers.22.linear_attn.out_proj": {
2256
  "group_size": 64,
2257
- "bits": 5,
2258
  "mode": "affine"
2259
  },
2260
  "model.layers.22.mlp.switch_mlp.gate_proj": {
@@ -2268,29 +2268,29 @@
2268
  "mode": "mxfp4"
2269
  },
2270
  "model.layers.22.mlp.switch_mlp.down_proj": {
2271
- "group_size": 64,
2272
- "bits": 6,
2273
- "mode": "affine"
2274
- },
2275
- "model.layers.23.self_attn.q_proj": {
2276
  "group_size": 32,
2277
  "bits": 4,
2278
  "mode": "mxfp4"
2279
  },
 
 
 
 
 
2280
  "model.layers.23.self_attn.k_proj": {
2281
- "group_size": 32,
2282
- "bits": 4,
2283
- "mode": "mxfp4"
2284
  },
2285
  "model.layers.23.self_attn.v_proj": {
2286
- "group_size": 32,
2287
- "bits": 4,
2288
- "mode": "mxfp4"
2289
  },
2290
  "model.layers.23.self_attn.o_proj": {
2291
- "group_size": 32,
2292
- "bits": 4,
2293
- "mode": "mxfp4"
2294
  },
2295
  "model.layers.23.mlp.switch_mlp.gate_proj": {
2296
  "group_size": 32,
@@ -2308,18 +2308,18 @@
2308
  "mode": "mxfp4"
2309
  },
2310
  "model.layers.24.linear_attn.in_proj_qkvz": {
2311
- "group_size": 32,
2312
- "bits": 4,
2313
- "mode": "mxfp4"
2314
  },
2315
  "model.layers.24.linear_attn.in_proj_ba": {
2316
  "group_size": 64,
2317
- "bits": 4,
2318
  "mode": "affine"
2319
  },
2320
  "model.layers.24.linear_attn.out_proj": {
2321
  "group_size": 64,
2322
- "bits": 5,
2323
  "mode": "affine"
2324
  },
2325
  "model.layers.24.mlp.switch_mlp.gate_proj": {
@@ -2338,18 +2338,18 @@
2338
  "mode": "mxfp4"
2339
  },
2340
  "model.layers.25.linear_attn.in_proj_qkvz": {
2341
- "group_size": 32,
2342
- "bits": 4,
2343
- "mode": "mxfp4"
2344
  },
2345
  "model.layers.25.linear_attn.in_proj_ba": {
2346
  "group_size": 64,
2347
- "bits": 4,
2348
  "mode": "affine"
2349
  },
2350
  "model.layers.25.linear_attn.out_proj": {
2351
  "group_size": 64,
2352
- "bits": 5,
2353
  "mode": "affine"
2354
  },
2355
  "model.layers.25.mlp.switch_mlp.gate_proj": {
@@ -2368,18 +2368,18 @@
2368
  "mode": "mxfp4"
2369
  },
2370
  "model.layers.26.linear_attn.in_proj_qkvz": {
2371
- "group_size": 32,
2372
- "bits": 4,
2373
- "mode": "mxfp4"
2374
  },
2375
  "model.layers.26.linear_attn.in_proj_ba": {
2376
  "group_size": 64,
2377
- "bits": 4,
2378
  "mode": "affine"
2379
  },
2380
  "model.layers.26.linear_attn.out_proj": {
2381
  "group_size": 64,
2382
- "bits": 5,
2383
  "mode": "affine"
2384
  },
2385
  "model.layers.26.mlp.switch_mlp.gate_proj": {
@@ -2392,30 +2392,30 @@
2392
  "bits": 4,
2393
  "mode": "mxfp4"
2394
  },
2395
- "model.layers.26.mlp.switch_mlp.down_proj": {
2396
- "group_size": 32,
2397
- "bits": 4,
2398
- "mode": "mxfp4"
2399
- },
2400
- "model.layers.27.self_attn.q_proj": {
2401
- "group_size": 32,
2402
- "bits": 4,
2403
- "mode": "mxfp4"
2404
- },
2405
- "model.layers.27.self_attn.k_proj": {
2406
- "group_size": 32,
2407
- "bits": 4,
2408
- "mode": "mxfp4"
2409
- },
2410
- "model.layers.27.self_attn.v_proj": {
2411
  "group_size": 32,
2412
  "bits": 4,
2413
  "mode": "mxfp4"
2414
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2415
  "model.layers.27.self_attn.o_proj": {
2416
- "group_size": 32,
2417
- "bits": 4,
2418
- "mode": "mxfp4"
2419
  },
2420
  "model.layers.27.mlp.switch_mlp.gate_proj": {
2421
  "group_size": 32,
@@ -2433,18 +2433,18 @@
2433
  "mode": "mxfp4"
2434
  },
2435
  "model.layers.28.linear_attn.in_proj_qkvz": {
2436
- "group_size": 32,
2437
- "bits": 4,
2438
- "mode": "mxfp4"
2439
  },
2440
  "model.layers.28.linear_attn.in_proj_ba": {
2441
  "group_size": 64,
2442
- "bits": 4,
2443
  "mode": "affine"
2444
  },
2445
  "model.layers.28.linear_attn.out_proj": {
2446
  "group_size": 64,
2447
- "bits": 5,
2448
  "mode": "affine"
2449
  },
2450
  "model.layers.28.mlp.switch_mlp.gate_proj": {
@@ -2463,18 +2463,18 @@
2463
  "mode": "mxfp4"
2464
  },
2465
  "model.layers.29.linear_attn.in_proj_qkvz": {
2466
- "group_size": 32,
2467
- "bits": 4,
2468
- "mode": "mxfp4"
2469
  },
2470
  "model.layers.29.linear_attn.in_proj_ba": {
2471
  "group_size": 64,
2472
- "bits": 4,
2473
  "mode": "affine"
2474
  },
2475
  "model.layers.29.linear_attn.out_proj": {
2476
  "group_size": 64,
2477
- "bits": 5,
2478
  "mode": "affine"
2479
  },
2480
  "model.layers.29.mlp.switch_mlp.gate_proj": {
@@ -2493,18 +2493,18 @@
2493
  "mode": "mxfp4"
2494
  },
2495
  "model.layers.30.linear_attn.in_proj_qkvz": {
2496
- "group_size": 32,
2497
- "bits": 4,
2498
- "mode": "mxfp4"
2499
  },
2500
  "model.layers.30.linear_attn.in_proj_ba": {
2501
  "group_size": 64,
2502
- "bits": 4,
2503
  "mode": "affine"
2504
  },
2505
  "model.layers.30.linear_attn.out_proj": {
2506
  "group_size": 64,
2507
- "bits": 6,
2508
  "mode": "affine"
2509
  },
2510
  "model.layers.30.mlp.switch_mlp.gate_proj": {
@@ -2523,24 +2523,24 @@
2523
  "mode": "mxfp4"
2524
  },
2525
  "model.layers.31.self_attn.q_proj": {
2526
- "group_size": 32,
2527
- "bits": 4,
2528
- "mode": "mxfp4"
2529
  },
2530
  "model.layers.31.self_attn.k_proj": {
2531
- "group_size": 32,
2532
- "bits": 4,
2533
- "mode": "mxfp4"
2534
  },
2535
  "model.layers.31.self_attn.v_proj": {
2536
- "group_size": 32,
2537
- "bits": 4,
2538
- "mode": "mxfp4"
2539
  },
2540
  "model.layers.31.self_attn.o_proj": {
2541
- "group_size": 32,
2542
- "bits": 4,
2543
- "mode": "mxfp4"
2544
  },
2545
  "model.layers.31.mlp.switch_mlp.gate_proj": {
2546
  "group_size": 32,
@@ -2553,23 +2553,23 @@
2553
  "mode": "mxfp4"
2554
  },
2555
  "model.layers.31.mlp.switch_mlp.down_proj": {
2556
- "group_size": 64,
2557
- "bits": 6,
2558
- "mode": "affine"
2559
- },
2560
- "model.layers.32.linear_attn.in_proj_qkvz": {
2561
  "group_size": 32,
2562
  "bits": 4,
2563
  "mode": "mxfp4"
2564
  },
 
 
 
 
 
2565
  "model.layers.32.linear_attn.in_proj_ba": {
2566
  "group_size": 64,
2567
- "bits": 4,
2568
  "mode": "affine"
2569
  },
2570
  "model.layers.32.linear_attn.out_proj": {
2571
  "group_size": 64,
2572
- "bits": 6,
2573
  "mode": "affine"
2574
  },
2575
  "model.layers.32.mlp.switch_mlp.gate_proj": {
@@ -2588,18 +2588,18 @@
2588
  "mode": "mxfp4"
2589
  },
2590
  "model.layers.33.linear_attn.in_proj_qkvz": {
2591
- "group_size": 32,
2592
- "bits": 4,
2593
- "mode": "mxfp4"
2594
  },
2595
  "model.layers.33.linear_attn.in_proj_ba": {
2596
  "group_size": 64,
2597
- "bits": 4,
2598
  "mode": "affine"
2599
  },
2600
  "model.layers.33.linear_attn.out_proj": {
2601
  "group_size": 64,
2602
- "bits": 5,
2603
  "mode": "affine"
2604
  },
2605
  "model.layers.33.mlp.switch_mlp.gate_proj": {
@@ -2618,18 +2618,18 @@
2618
  "mode": "mxfp4"
2619
  },
2620
  "model.layers.34.linear_attn.in_proj_qkvz": {
2621
- "group_size": 32,
2622
- "bits": 4,
2623
- "mode": "mxfp4"
2624
  },
2625
  "model.layers.34.linear_attn.in_proj_ba": {
2626
  "group_size": 64,
2627
- "bits": 4,
2628
  "mode": "affine"
2629
  },
2630
  "model.layers.34.linear_attn.out_proj": {
2631
  "group_size": 64,
2632
- "bits": 6,
2633
  "mode": "affine"
2634
  },
2635
  "model.layers.34.mlp.switch_mlp.gate_proj": {
@@ -2648,24 +2648,24 @@
2648
  "mode": "mxfp4"
2649
  },
2650
  "model.layers.35.self_attn.q_proj": {
2651
- "group_size": 32,
2652
- "bits": 4,
2653
- "mode": "mxfp4"
2654
  },
2655
  "model.layers.35.self_attn.k_proj": {
2656
- "group_size": 32,
2657
- "bits": 4,
2658
- "mode": "mxfp4"
2659
  },
2660
  "model.layers.35.self_attn.v_proj": {
2661
- "group_size": 32,
2662
- "bits": 4,
2663
- "mode": "mxfp4"
2664
  },
2665
  "model.layers.35.self_attn.o_proj": {
2666
- "group_size": 32,
2667
- "bits": 4,
2668
- "mode": "mxfp4"
2669
  },
2670
  "model.layers.35.mlp.switch_mlp.gate_proj": {
2671
  "group_size": 32,
@@ -2683,18 +2683,18 @@
2683
  "mode": "mxfp4"
2684
  },
2685
  "model.layers.36.linear_attn.in_proj_qkvz": {
2686
- "group_size": 32,
2687
- "bits": 4,
2688
- "mode": "mxfp4"
2689
  },
2690
  "model.layers.36.linear_attn.in_proj_ba": {
2691
  "group_size": 64,
2692
- "bits": 4,
2693
  "mode": "affine"
2694
  },
2695
  "model.layers.36.linear_attn.out_proj": {
2696
  "group_size": 64,
2697
- "bits": 5,
2698
  "mode": "affine"
2699
  },
2700
  "model.layers.36.mlp.switch_mlp.gate_proj": {
@@ -2713,18 +2713,18 @@
2713
  "mode": "mxfp4"
2714
  },
2715
  "model.layers.37.linear_attn.in_proj_qkvz": {
2716
- "group_size": 32,
2717
- "bits": 4,
2718
- "mode": "mxfp4"
2719
  },
2720
  "model.layers.37.linear_attn.in_proj_ba": {
2721
  "group_size": 64,
2722
- "bits": 4,
2723
  "mode": "affine"
2724
  },
2725
  "model.layers.37.linear_attn.out_proj": {
2726
  "group_size": 64,
2727
- "bits": 5,
2728
  "mode": "affine"
2729
  },
2730
  "model.layers.37.mlp.switch_mlp.gate_proj": {
@@ -2743,18 +2743,18 @@
2743
  "mode": "mxfp4"
2744
  },
2745
  "model.layers.38.linear_attn.in_proj_qkvz": {
2746
- "group_size": 32,
2747
- "bits": 4,
2748
- "mode": "mxfp4"
2749
  },
2750
  "model.layers.38.linear_attn.in_proj_ba": {
2751
  "group_size": 64,
2752
- "bits": 4,
2753
  "mode": "affine"
2754
  },
2755
  "model.layers.38.linear_attn.out_proj": {
2756
  "group_size": 64,
2757
- "bits": 5,
2758
  "mode": "affine"
2759
  },
2760
  "model.layers.38.mlp.switch_mlp.gate_proj": {
@@ -2773,24 +2773,24 @@
2773
  "mode": "mxfp4"
2774
  },
2775
  "model.layers.39.self_attn.q_proj": {
2776
- "group_size": 32,
2777
- "bits": 4,
2778
- "mode": "mxfp4"
2779
  },
2780
  "model.layers.39.self_attn.k_proj": {
2781
- "group_size": 32,
2782
- "bits": 4,
2783
- "mode": "mxfp4"
2784
  },
2785
  "model.layers.39.self_attn.v_proj": {
2786
- "group_size": 32,
2787
- "bits": 4,
2788
- "mode": "mxfp4"
2789
  },
2790
  "model.layers.39.self_attn.o_proj": {
2791
- "group_size": 32,
2792
- "bits": 4,
2793
- "mode": "mxfp4"
2794
  },
2795
  "model.layers.39.mlp.switch_mlp.gate_proj": {
2796
  "group_size": 32,
@@ -2803,23 +2803,23 @@
2803
  "mode": "mxfp4"
2804
  },
2805
  "model.layers.39.mlp.switch_mlp.down_proj": {
2806
- "group_size": 64,
2807
- "bits": 6,
2808
- "mode": "affine"
2809
- },
2810
- "model.layers.40.linear_attn.in_proj_qkvz": {
2811
  "group_size": 32,
2812
  "bits": 4,
2813
  "mode": "mxfp4"
2814
  },
 
 
 
 
 
2815
  "model.layers.40.linear_attn.in_proj_ba": {
2816
  "group_size": 64,
2817
- "bits": 4,
2818
  "mode": "affine"
2819
  },
2820
  "model.layers.40.linear_attn.out_proj": {
2821
  "group_size": 64,
2822
- "bits": 5,
2823
  "mode": "affine"
2824
  },
2825
  "model.layers.40.mlp.switch_mlp.gate_proj": {
@@ -2838,18 +2838,18 @@
2838
  "mode": "mxfp4"
2839
  },
2840
  "model.layers.41.linear_attn.in_proj_qkvz": {
2841
- "group_size": 32,
2842
- "bits": 4,
2843
- "mode": "mxfp4"
2844
  },
2845
  "model.layers.41.linear_attn.in_proj_ba": {
2846
  "group_size": 64,
2847
- "bits": 4,
2848
  "mode": "affine"
2849
  },
2850
  "model.layers.41.linear_attn.out_proj": {
2851
  "group_size": 64,
2852
- "bits": 5,
2853
  "mode": "affine"
2854
  },
2855
  "model.layers.41.mlp.switch_mlp.gate_proj": {
@@ -2868,18 +2868,18 @@
2868
  "mode": "mxfp4"
2869
  },
2870
  "model.layers.42.linear_attn.in_proj_qkvz": {
2871
- "group_size": 32,
2872
- "bits": 4,
2873
- "mode": "mxfp4"
2874
  },
2875
  "model.layers.42.linear_attn.in_proj_ba": {
2876
  "group_size": 64,
2877
- "bits": 4,
2878
  "mode": "affine"
2879
  },
2880
  "model.layers.42.linear_attn.out_proj": {
2881
  "group_size": 64,
2882
- "bits": 5,
2883
  "mode": "affine"
2884
  },
2885
  "model.layers.42.mlp.switch_mlp.gate_proj": {
@@ -2893,28 +2893,28 @@
2893
  "mode": "mxfp4"
2894
  },
2895
  "model.layers.42.mlp.switch_mlp.down_proj": {
2896
- "group_size": 64,
2897
- "bits": 6,
2898
- "mode": "affine"
2899
  },
2900
  "model.layers.43.self_attn.q_proj": {
2901
  "group_size": 64,
2902
- "bits": 5,
2903
  "mode": "affine"
2904
  },
2905
  "model.layers.43.self_attn.k_proj": {
2906
  "group_size": 64,
2907
- "bits": 5,
2908
  "mode": "affine"
2909
  },
2910
  "model.layers.43.self_attn.v_proj": {
2911
  "group_size": 64,
2912
- "bits": 5,
2913
  "mode": "affine"
2914
  },
2915
  "model.layers.43.self_attn.o_proj": {
2916
  "group_size": 64,
2917
- "bits": 5,
2918
  "mode": "affine"
2919
  },
2920
  "model.layers.43.mlp.switch_mlp.gate_proj": {
@@ -2928,23 +2928,23 @@
2928
  "mode": "mxfp4"
2929
  },
2930
  "model.layers.43.mlp.switch_mlp.down_proj": {
2931
- "group_size": 64,
2932
- "bits": 6,
2933
- "mode": "affine"
2934
- },
2935
- "model.layers.44.linear_attn.in_proj_qkvz": {
2936
  "group_size": 32,
2937
  "bits": 4,
2938
  "mode": "mxfp4"
2939
  },
 
 
 
 
 
2940
  "model.layers.44.linear_attn.in_proj_ba": {
2941
  "group_size": 64,
2942
- "bits": 4,
2943
  "mode": "affine"
2944
  },
2945
  "model.layers.44.linear_attn.out_proj": {
2946
  "group_size": 64,
2947
- "bits": 6,
2948
  "mode": "affine"
2949
  },
2950
  "model.layers.44.mlp.switch_mlp.gate_proj": {
@@ -2964,12 +2964,12 @@
2964
  },
2965
  "model.layers.45.linear_attn.in_proj_qkvz": {
2966
  "group_size": 64,
2967
- "bits": 5,
2968
  "mode": "affine"
2969
  },
2970
  "model.layers.45.linear_attn.in_proj_ba": {
2971
  "group_size": 64,
2972
- "bits": 4,
2973
  "mode": "affine"
2974
  },
2975
  "model.layers.45.linear_attn.out_proj": {
@@ -2988,18 +2988,18 @@
2988
  "mode": "mxfp4"
2989
  },
2990
  "model.layers.45.mlp.switch_mlp.down_proj": {
2991
- "group_size": 64,
2992
- "bits": 6,
2993
- "mode": "affine"
2994
  },
2995
  "model.layers.46.linear_attn.in_proj_qkvz": {
2996
  "group_size": 64,
2997
- "bits": 5,
2998
  "mode": "affine"
2999
  },
3000
  "model.layers.46.linear_attn.in_proj_ba": {
3001
  "group_size": 64,
3002
- "bits": 4,
3003
  "mode": "affine"
3004
  },
3005
  "model.layers.46.linear_attn.out_proj": {
@@ -3018,28 +3018,28 @@
3018
  "mode": "mxfp4"
3019
  },
3020
  "model.layers.46.mlp.switch_mlp.down_proj": {
3021
- "group_size": 64,
3022
- "bits": 6,
3023
- "mode": "affine"
3024
  },
3025
  "model.layers.47.self_attn.q_proj": {
3026
  "group_size": 64,
3027
- "bits": 5,
3028
  "mode": "affine"
3029
  },
3030
  "model.layers.47.self_attn.k_proj": {
3031
  "group_size": 64,
3032
- "bits": 5,
3033
  "mode": "affine"
3034
  },
3035
  "model.layers.47.self_attn.v_proj": {
3036
  "group_size": 64,
3037
- "bits": 5,
3038
  "mode": "affine"
3039
  },
3040
  "model.layers.47.self_attn.o_proj": {
3041
  "group_size": 64,
3042
- "bits": 5,
3043
  "mode": "affine"
3044
  },
3045
  "model.layers.47.mlp.switch_mlp.gate_proj": {
@@ -3053,13 +3053,13 @@
3053
  "mode": "mxfp4"
3054
  },
3055
  "model.layers.47.mlp.switch_mlp.down_proj": {
3056
- "group_size": 64,
3057
- "bits": 6,
3058
- "mode": "affine"
3059
  },
3060
  "lm_head": {
3061
  "group_size": 64,
3062
- "bits": 6,
3063
  "mode": "affine"
3064
  }
3065
  },
 
39
  "mode": "affine",
40
  "model.embed_tokens": {
41
  "group_size": 64,
42
+ "bits": 8,
43
  "mode": "affine"
44
  },
45
  "model.layers.0.linear_attn.in_proj_qkvz": {
46
+ "group_size": 64,
47
+ "bits": 8,
48
+ "mode": "affine"
49
  },
50
  "model.layers.0.linear_attn.in_proj_ba": {
51
  "group_size": 64,
52
+ "bits": 8,
53
  "mode": "affine"
54
  },
55
  "model.layers.0.linear_attn.out_proj": {
56
  "group_size": 64,
57
+ "bits": 8,
58
  "mode": "affine"
59
  },
60
  "model.layers.0.mlp.switch_mlp.gate_proj": {
 
73
  "mode": "mxfp4"
74
  },
75
  "model.layers.1.linear_attn.in_proj_qkvz": {
76
+ "group_size": 64,
77
+ "bits": 8,
78
+ "mode": "affine"
79
  },
80
  "model.layers.1.linear_attn.in_proj_ba": {
81
  "group_size": 64,
82
+ "bits": 8,
83
  "mode": "affine"
84
  },
85
  "model.layers.1.linear_attn.out_proj": {
86
  "group_size": 64,
87
+ "bits": 8,
88
  "mode": "affine"
89
  },
90
  "model.layers.1.mlp.switch_mlp.gate_proj": {
 
98
  "mode": "mxfp4"
99
  },
100
  "model.layers.1.mlp.switch_mlp.down_proj": {
 
 
 
 
 
101
  "group_size": 32,
102
  "bits": 4,
103
  "mode": "mxfp4"
104
  },
105
+ "model.layers.2.linear_attn.in_proj_qkvz": {
106
+ "group_size": 64,
107
+ "bits": 8,
108
+ "mode": "affine"
109
+ },
110
  "model.layers.2.linear_attn.in_proj_ba": {
111
  "group_size": 64,
112
+ "bits": 8,
113
  "mode": "affine"
114
  },
115
  "model.layers.2.linear_attn.out_proj": {
116
  "group_size": 64,
117
+ "bits": 8,
118
  "mode": "affine"
119
  },
120
  "model.layers.2.mlp.switch_mlp.gate_proj": {
 
133
  "mode": "mxfp4"
134
  },
135
  "model.layers.3.self_attn.q_proj": {
136
+ "group_size": 64,
137
+ "bits": 8,
138
+ "mode": "affine"
139
  },
140
  "model.layers.3.self_attn.k_proj": {
141
+ "group_size": 64,
142
+ "bits": 8,
143
+ "mode": "affine"
144
  },
145
  "model.layers.3.self_attn.v_proj": {
146
+ "group_size": 64,
147
+ "bits": 8,
148
+ "mode": "affine"
149
  },
150
  "model.layers.3.self_attn.o_proj": {
151
+ "group_size": 64,
152
+ "bits": 8,
153
+ "mode": "affine"
154
  },
155
  "model.layers.3.mlp.switch_mlp.gate_proj": {
156
  "group_size": 32,
 
168
  "mode": "mxfp4"
169
  },
170
  "model.layers.4.linear_attn.in_proj_qkvz": {
171
+ "group_size": 64,
172
+ "bits": 8,
173
+ "mode": "affine"
174
  },
175
  "model.layers.4.linear_attn.in_proj_ba": {
176
  "group_size": 64,
177
+ "bits": 8,
178
  "mode": "affine"
179
  },
180
  "model.layers.4.linear_attn.out_proj": {
181
  "group_size": 64,
182
+ "bits": 8,
183
  "mode": "affine"
184
  },
185
  "model.layers.4.mlp.switch_mlp.gate_proj": {
 
198
  "mode": "mxfp4"
199
  },
200
  "model.layers.5.linear_attn.in_proj_qkvz": {
201
+ "group_size": 64,
202
+ "bits": 8,
203
+ "mode": "affine"
204
  },
205
  "model.layers.5.linear_attn.in_proj_ba": {
206
  "group_size": 64,
207
+ "bits": 8,
208
  "mode": "affine"
209
  },
210
  "model.layers.5.linear_attn.out_proj": {
211
  "group_size": 64,
212
+ "bits": 8,
213
  "mode": "affine"
214
  },
215
  "model.layers.5.mlp.switch_mlp.gate_proj": {
 
228
  "mode": "mxfp4"
229
  },
230
  "model.layers.6.linear_attn.in_proj_qkvz": {
231
+ "group_size": 64,
232
+ "bits": 8,
233
+ "mode": "affine"
234
  },
235
  "model.layers.6.linear_attn.in_proj_ba": {
236
  "group_size": 64,
237
+ "bits": 8,
238
  "mode": "affine"
239
  },
240
  "model.layers.6.linear_attn.out_proj": {
241
  "group_size": 64,
242
+ "bits": 8,
243
  "mode": "affine"
244
  },
245
  "model.layers.6.mlp.switch_mlp.gate_proj": {
 
253
  "mode": "mxfp4"
254
  },
255
  "model.layers.6.mlp.switch_mlp.down_proj": {
 
 
 
 
 
256
  "group_size": 32,
257
  "bits": 4,
258
  "mode": "mxfp4"
259
  },
260
+ "model.layers.7.self_attn.q_proj": {
261
+ "group_size": 64,
262
+ "bits": 8,
263
+ "mode": "affine"
264
+ },
265
  "model.layers.7.self_attn.k_proj": {
266
+ "group_size": 64,
267
+ "bits": 8,
268
+ "mode": "affine"
269
  },
270
  "model.layers.7.self_attn.v_proj": {
271
+ "group_size": 64,
272
+ "bits": 8,
273
+ "mode": "affine"
274
  },
275
  "model.layers.7.self_attn.o_proj": {
276
+ "group_size": 64,
277
+ "bits": 8,
278
+ "mode": "affine"
279
  },
280
  "model.layers.7.mlp.switch_mlp.gate_proj": {
281
  "group_size": 32,
 
293
  "mode": "mxfp4"
294
  },
295
  "model.layers.8.linear_attn.in_proj_qkvz": {
296
+ "group_size": 64,
297
+ "bits": 8,
298
+ "mode": "affine"
299
  },
300
  "model.layers.8.linear_attn.in_proj_ba": {
301
  "group_size": 64,
302
+ "bits": 8,
303
  "mode": "affine"
304
  },
305
  "model.layers.8.linear_attn.out_proj": {
306
  "group_size": 64,
307
+ "bits": 8,
308
  "mode": "affine"
309
  },
310
  "model.layers.8.mlp.switch_mlp.gate_proj": {
 
323
  "mode": "mxfp4"
324
  },
325
  "model.layers.9.linear_attn.in_proj_qkvz": {
326
+ "group_size": 64,
327
+ "bits": 8,
328
+ "mode": "affine"
329
  },
330
  "model.layers.9.linear_attn.in_proj_ba": {
331
  "group_size": 64,
332
+ "bits": 8,
333
  "mode": "affine"
334
  },
335
  "model.layers.9.linear_attn.out_proj": {
336
  "group_size": 64,
337
+ "bits": 8,
338
  "mode": "affine"
339
  },
340
  "model.layers.9.mlp.switch_mlp.gate_proj": {
 
353
  "mode": "mxfp4"
354
  },
355
  "model.layers.10.linear_attn.in_proj_qkvz": {
356
+ "group_size": 64,
357
+ "bits": 8,
358
+ "mode": "affine"
359
  },
360
  "model.layers.10.linear_attn.in_proj_ba": {
361
  "group_size": 64,
362
+ "bits": 8,
363
  "mode": "affine"
364
  },
365
  "model.layers.10.linear_attn.out_proj": {
366
  "group_size": 64,
367
+ "bits": 8,
368
  "mode": "affine"
369
  },
370
  "model.layers.10.mlp.switch_mlp.gate_proj": {
 
378
  "mode": "mxfp4"
379
  },
380
  "model.layers.10.mlp.switch_mlp.down_proj": {
 
 
 
 
 
381
  "group_size": 32,
382
  "bits": 4,
383
  "mode": "mxfp4"
384
  },
385
+ "model.layers.11.self_attn.q_proj": {
386
+ "group_size": 64,
387
+ "bits": 8,
388
+ "mode": "affine"
389
+ },
390
  "model.layers.11.self_attn.k_proj": {
391
+ "group_size": 64,
392
+ "bits": 8,
393
+ "mode": "affine"
394
  },
395
  "model.layers.11.self_attn.v_proj": {
396
+ "group_size": 64,
397
+ "bits": 8,
398
+ "mode": "affine"
399
  },
400
  "model.layers.11.self_attn.o_proj": {
401
+ "group_size": 64,
402
+ "bits": 8,
403
+ "mode": "affine"
404
  },
405
  "model.layers.11.mlp.switch_mlp.gate_proj": {
406
  "group_size": 32,
 
418
  "mode": "mxfp4"
419
  },
420
  "model.layers.12.linear_attn.in_proj_qkvz": {
421
+ "group_size": 64,
422
+ "bits": 8,
423
+ "mode": "affine"
424
  },
425
  "model.layers.12.linear_attn.in_proj_ba": {
426
  "group_size": 64,
427
+ "bits": 8,
428
  "mode": "affine"
429
  },
430
  "model.layers.12.linear_attn.out_proj": {
431
  "group_size": 64,
432
+ "bits": 8,
433
  "mode": "affine"
434
  },
435
  "model.layers.12.mlp.switch_mlp.gate_proj": {
 
448
  "mode": "mxfp4"
449
  },
450
  "model.layers.13.linear_attn.in_proj_qkvz": {
451
+ "group_size": 64,
452
+ "bits": 8,
453
+ "mode": "affine"
454
  },
455
  "model.layers.13.linear_attn.in_proj_ba": {
456
  "group_size": 64,
457
+ "bits": 8,
458
  "mode": "affine"
459
  },
460
  "model.layers.13.linear_attn.out_proj": {
461
  "group_size": 64,
462
+ "bits": 8,
463
  "mode": "affine"
464
  },
465
  "model.layers.13.mlp.switch_mlp.gate_proj": {
 
478
  "mode": "mxfp4"
479
  },
480
  "model.layers.14.linear_attn.in_proj_qkvz": {
481
+ "group_size": 64,
482
+ "bits": 8,
483
+ "mode": "affine"
484
  },
485
  "model.layers.14.linear_attn.in_proj_ba": {
486
  "group_size": 64,
487
+ "bits": 8,
488
  "mode": "affine"
489
  },
490
  "model.layers.14.linear_attn.out_proj": {
491
  "group_size": 64,
492
+ "bits": 8,
493
  "mode": "affine"
494
  },
495
  "model.layers.14.mlp.switch_mlp.gate_proj": {
 
503
  "mode": "mxfp4"
504
  },
505
  "model.layers.14.mlp.switch_mlp.down_proj": {
 
 
 
 
 
506
  "group_size": 32,
507
  "bits": 4,
508
  "mode": "mxfp4"
509
  },
510
+ "model.layers.15.self_attn.q_proj": {
511
+ "group_size": 64,
512
+ "bits": 8,
513
+ "mode": "affine"
514
+ },
515
  "model.layers.15.self_attn.k_proj": {
516
+ "group_size": 64,
517
+ "bits": 8,
518
+ "mode": "affine"
519
  },
520
  "model.layers.15.self_attn.v_proj": {
521
+ "group_size": 64,
522
+ "bits": 8,
523
+ "mode": "affine"
524
  },
525
  "model.layers.15.self_attn.o_proj": {
526
+ "group_size": 64,
527
+ "bits": 8,
528
+ "mode": "affine"
529
  },
530
  "model.layers.15.mlp.switch_mlp.gate_proj": {
531
  "group_size": 32,
 
543
  "mode": "mxfp4"
544
  },
545
  "model.layers.16.linear_attn.in_proj_qkvz": {
546
+ "group_size": 64,
547
+ "bits": 8,
548
+ "mode": "affine"
549
  },
550
  "model.layers.16.linear_attn.in_proj_ba": {
551
  "group_size": 64,
552
+ "bits": 8,
553
  "mode": "affine"
554
  },
555
  "model.layers.16.linear_attn.out_proj": {
556
  "group_size": 64,
557
+ "bits": 8,
558
  "mode": "affine"
559
  },
560
  "model.layers.16.mlp.switch_mlp.gate_proj": {
 
573
  "mode": "mxfp4"
574
  },
575
  "model.layers.17.linear_attn.in_proj_qkvz": {
576
+ "group_size": 64,
577
+ "bits": 8,
578
+ "mode": "affine"
579
  },
580
  "model.layers.17.linear_attn.in_proj_ba": {
581
  "group_size": 64,
582
+ "bits": 8,
583
  "mode": "affine"
584
  },
585
  "model.layers.17.linear_attn.out_proj": {
586
  "group_size": 64,
587
+ "bits": 8,
588
  "mode": "affine"
589
  },
590
  "model.layers.17.mlp.switch_mlp.gate_proj": {
 
603
  "mode": "mxfp4"
604
  },
605
  "model.layers.18.linear_attn.in_proj_qkvz": {
606
+ "group_size": 64,
607
+ "bits": 8,
608
+ "mode": "affine"
609
  },
610
  "model.layers.18.linear_attn.in_proj_ba": {
611
  "group_size": 64,
612
+ "bits": 8,
613
  "mode": "affine"
614
  },
615
  "model.layers.18.linear_attn.out_proj": {
616
  "group_size": 64,
617
+ "bits": 8,
618
  "mode": "affine"
619
  },
620
  "model.layers.18.mlp.switch_mlp.gate_proj": {
 
633
  "mode": "mxfp4"
634
  },
635
  "model.layers.19.self_attn.q_proj": {
636
+ "group_size": 64,
637
+ "bits": 8,
638
+ "mode": "affine"
639
  },
640
  "model.layers.19.self_attn.k_proj": {
641
+ "group_size": 64,
642
+ "bits": 8,
643
+ "mode": "affine"
644
  },
645
  "model.layers.19.self_attn.v_proj": {
646
+ "group_size": 64,
647
+ "bits": 8,
648
+ "mode": "affine"
649
  },
650
  "model.layers.19.self_attn.o_proj": {
651
+ "group_size": 64,
652
+ "bits": 8,
653
+ "mode": "affine"
654
  },
655
  "model.layers.19.mlp.switch_mlp.gate_proj": {
656
  "group_size": 32,
 
668
  "mode": "mxfp4"
669
  },
670
  "model.layers.20.linear_attn.in_proj_qkvz": {
671
+ "group_size": 64,
672
+ "bits": 8,
673
+ "mode": "affine"
674
  },
675
  "model.layers.20.linear_attn.in_proj_ba": {
676
  "group_size": 64,
677
+ "bits": 8,
678
  "mode": "affine"
679
  },
680
  "model.layers.20.linear_attn.out_proj": {
681
  "group_size": 64,
682
+ "bits": 8,
683
  "mode": "affine"
684
  },
685
  "model.layers.20.mlp.switch_mlp.gate_proj": {
 
698
  "mode": "mxfp4"
699
  },
700
  "model.layers.21.linear_attn.in_proj_qkvz": {
701
+ "group_size": 64,
702
+ "bits": 8,
703
+ "mode": "affine"
704
  },
705
  "model.layers.21.linear_attn.in_proj_ba": {
706
  "group_size": 64,
707
+ "bits": 8,
708
  "mode": "affine"
709
  },
710
  "model.layers.21.linear_attn.out_proj": {
711
  "group_size": 64,
712
+ "bits": 8,
713
  "mode": "affine"
714
  },
715
  "model.layers.21.mlp.switch_mlp.gate_proj": {
 
728
  "mode": "mxfp4"
729
  },
730
  "model.layers.22.linear_attn.in_proj_qkvz": {
731
+ "group_size": 64,
732
+ "bits": 8,
733
+ "mode": "affine"
734
  },
735
  "model.layers.22.linear_attn.in_proj_ba": {
736
  "group_size": 64,
737
+ "bits": 8,
738
  "mode": "affine"
739
  },
740
  "model.layers.22.linear_attn.out_proj": {
741
  "group_size": 64,
742
+ "bits": 8,
743
  "mode": "affine"
744
  },
745
  "model.layers.22.mlp.switch_mlp.gate_proj": {
 
753
  "mode": "mxfp4"
754
  },
755
  "model.layers.22.mlp.switch_mlp.down_proj": {
 
 
 
 
 
756
  "group_size": 32,
757
  "bits": 4,
758
  "mode": "mxfp4"
759
  },
760
+ "model.layers.23.self_attn.q_proj": {
761
+ "group_size": 64,
762
+ "bits": 8,
763
+ "mode": "affine"
764
+ },
765
  "model.layers.23.self_attn.k_proj": {
766
+ "group_size": 64,
767
+ "bits": 8,
768
+ "mode": "affine"
769
  },
770
  "model.layers.23.self_attn.v_proj": {
771
+ "group_size": 64,
772
+ "bits": 8,
773
+ "mode": "affine"
774
  },
775
  "model.layers.23.self_attn.o_proj": {
776
+ "group_size": 64,
777
+ "bits": 8,
778
+ "mode": "affine"
779
  },
780
  "model.layers.23.mlp.switch_mlp.gate_proj": {
781
  "group_size": 32,
 
793
  "mode": "mxfp4"
794
  },
795
  "model.layers.24.linear_attn.in_proj_qkvz": {
796
+ "group_size": 64,
797
+ "bits": 8,
798
+ "mode": "affine"
799
  },
800
  "model.layers.24.linear_attn.in_proj_ba": {
801
  "group_size": 64,
802
+ "bits": 8,
803
  "mode": "affine"
804
  },
805
  "model.layers.24.linear_attn.out_proj": {
806
  "group_size": 64,
807
+ "bits": 8,
808
  "mode": "affine"
809
  },
810
  "model.layers.24.mlp.switch_mlp.gate_proj": {
 
823
  "mode": "mxfp4"
824
  },
825
  "model.layers.25.linear_attn.in_proj_qkvz": {
826
+ "group_size": 64,
827
+ "bits": 8,
828
+ "mode": "affine"
829
  },
830
  "model.layers.25.linear_attn.in_proj_ba": {
831
  "group_size": 64,
832
+ "bits": 8,
833
  "mode": "affine"
834
  },
835
  "model.layers.25.linear_attn.out_proj": {
836
  "group_size": 64,
837
+ "bits": 8,
838
  "mode": "affine"
839
  },
840
  "model.layers.25.mlp.switch_mlp.gate_proj": {
 
853
  "mode": "mxfp4"
854
  },
855
  "model.layers.26.linear_attn.in_proj_qkvz": {
856
+ "group_size": 64,
857
+ "bits": 8,
858
+ "mode": "affine"
859
  },
860
  "model.layers.26.linear_attn.in_proj_ba": {
861
  "group_size": 64,
862
+ "bits": 8,
863
  "mode": "affine"
864
  },
865
  "model.layers.26.linear_attn.out_proj": {
866
  "group_size": 64,
867
+ "bits": 8,
868
  "mode": "affine"
869
  },
870
  "model.layers.26.mlp.switch_mlp.gate_proj": {
 
883
  "mode": "mxfp4"
884
  },
885
  "model.layers.27.self_attn.q_proj": {
886
+ "group_size": 64,
887
+ "bits": 8,
888
+ "mode": "affine"
889
  },
890
  "model.layers.27.self_attn.k_proj": {
891
+ "group_size": 64,
892
+ "bits": 8,
893
+ "mode": "affine"
894
  },
895
  "model.layers.27.self_attn.v_proj": {
896
+ "group_size": 64,
897
+ "bits": 8,
898
+ "mode": "affine"
899
  },
900
  "model.layers.27.self_attn.o_proj": {
901
+ "group_size": 64,
902
+ "bits": 8,
903
+ "mode": "affine"
904
  },
905
  "model.layers.27.mlp.switch_mlp.gate_proj": {
906
  "group_size": 32,
 
918
  "mode": "mxfp4"
919
  },
920
  "model.layers.28.linear_attn.in_proj_qkvz": {
921
+ "group_size": 64,
922
+ "bits": 8,
923
+ "mode": "affine"
924
  },
925
  "model.layers.28.linear_attn.in_proj_ba": {
926
  "group_size": 64,
927
+ "bits": 8,
928
  "mode": "affine"
929
  },
930
  "model.layers.28.linear_attn.out_proj": {
931
  "group_size": 64,
932
+ "bits": 8,
933
  "mode": "affine"
934
  },
935
  "model.layers.28.mlp.switch_mlp.gate_proj": {
 
948
  "mode": "mxfp4"
949
  },
950
  "model.layers.29.linear_attn.in_proj_qkvz": {
951
+ "group_size": 64,
952
+ "bits": 8,
953
+ "mode": "affine"
954
  },
955
  "model.layers.29.linear_attn.in_proj_ba": {
956
  "group_size": 64,
957
+ "bits": 8,
958
  "mode": "affine"
959
  },
960
  "model.layers.29.linear_attn.out_proj": {
961
  "group_size": 64,
962
+ "bits": 8,
963
  "mode": "affine"
964
  },
965
  "model.layers.29.mlp.switch_mlp.gate_proj": {
 
978
  "mode": "mxfp4"
979
  },
980
  "model.layers.30.linear_attn.in_proj_qkvz": {
981
+ "group_size": 64,
982
+ "bits": 8,
983
+ "mode": "affine"
984
  },
985
  "model.layers.30.linear_attn.in_proj_ba": {
986
  "group_size": 64,
987
+ "bits": 8,
988
  "mode": "affine"
989
  },
990
  "model.layers.30.linear_attn.out_proj": {
991
  "group_size": 64,
992
+ "bits": 8,
993
  "mode": "affine"
994
  },
995
  "model.layers.30.mlp.switch_mlp.gate_proj": {
 
1008
  "mode": "mxfp4"
1009
  },
1010
  "model.layers.31.self_attn.q_proj": {
1011
+ "group_size": 64,
1012
+ "bits": 8,
1013
+ "mode": "affine"
1014
  },
1015
  "model.layers.31.self_attn.k_proj": {
1016
+ "group_size": 64,
1017
+ "bits": 8,
1018
+ "mode": "affine"
1019
  },
1020
  "model.layers.31.self_attn.v_proj": {
1021
+ "group_size": 64,
1022
+ "bits": 8,
1023
+ "mode": "affine"
1024
  },
1025
  "model.layers.31.self_attn.o_proj": {
1026
+ "group_size": 64,
1027
+ "bits": 8,
1028
+ "mode": "affine"
1029
  },
1030
  "model.layers.31.mlp.switch_mlp.gate_proj": {
1031
  "group_size": 32,
 
1038
  "mode": "mxfp4"
1039
  },
1040
  "model.layers.31.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1041
  "group_size": 32,
1042
  "bits": 4,
1043
  "mode": "mxfp4"
1044
  },
1045
+ "model.layers.32.linear_attn.in_proj_qkvz": {
1046
+ "group_size": 64,
1047
+ "bits": 8,
1048
+ "mode": "affine"
1049
+ },
1050
  "model.layers.32.linear_attn.in_proj_ba": {
1051
  "group_size": 64,
1052
+ "bits": 8,
1053
  "mode": "affine"
1054
  },
1055
  "model.layers.32.linear_attn.out_proj": {
1056
  "group_size": 64,
1057
+ "bits": 8,
1058
  "mode": "affine"
1059
  },
1060
  "model.layers.32.mlp.switch_mlp.gate_proj": {
 
1073
  "mode": "mxfp4"
1074
  },
1075
  "model.layers.33.linear_attn.in_proj_qkvz": {
1076
+ "group_size": 64,
1077
+ "bits": 8,
1078
+ "mode": "affine"
1079
  },
1080
  "model.layers.33.linear_attn.in_proj_ba": {
1081
  "group_size": 64,
1082
+ "bits": 8,
1083
  "mode": "affine"
1084
  },
1085
  "model.layers.33.linear_attn.out_proj": {
1086
  "group_size": 64,
1087
+ "bits": 8,
1088
  "mode": "affine"
1089
  },
1090
  "model.layers.33.mlp.switch_mlp.gate_proj": {
 
1103
  "mode": "mxfp4"
1104
  },
1105
  "model.layers.34.linear_attn.in_proj_qkvz": {
1106
+ "group_size": 64,
1107
+ "bits": 8,
1108
+ "mode": "affine"
1109
  },
1110
  "model.layers.34.linear_attn.in_proj_ba": {
1111
  "group_size": 64,
1112
+ "bits": 8,
1113
  "mode": "affine"
1114
  },
1115
  "model.layers.34.linear_attn.out_proj": {
1116
  "group_size": 64,
1117
+ "bits": 8,
1118
  "mode": "affine"
1119
  },
1120
  "model.layers.34.mlp.switch_mlp.gate_proj": {
 
1133
  "mode": "mxfp4"
1134
  },
1135
  "model.layers.35.self_attn.q_proj": {
1136
+ "group_size": 64,
1137
+ "bits": 8,
1138
+ "mode": "affine"
1139
  },
1140
  "model.layers.35.self_attn.k_proj": {
1141
+ "group_size": 64,
1142
+ "bits": 8,
1143
+ "mode": "affine"
1144
  },
1145
  "model.layers.35.self_attn.v_proj": {
1146
+ "group_size": 64,
1147
+ "bits": 8,
1148
+ "mode": "affine"
1149
  },
1150
  "model.layers.35.self_attn.o_proj": {
1151
+ "group_size": 64,
1152
+ "bits": 8,
1153
+ "mode": "affine"
1154
  },
1155
  "model.layers.35.mlp.switch_mlp.gate_proj": {
1156
  "group_size": 32,
 
1168
  "mode": "mxfp4"
1169
  },
1170
  "model.layers.36.linear_attn.in_proj_qkvz": {
1171
+ "group_size": 64,
1172
+ "bits": 8,
1173
+ "mode": "affine"
1174
  },
1175
  "model.layers.36.linear_attn.in_proj_ba": {
1176
  "group_size": 64,
1177
+ "bits": 8,
1178
  "mode": "affine"
1179
  },
1180
  "model.layers.36.linear_attn.out_proj": {
1181
  "group_size": 64,
1182
+ "bits": 8,
1183
  "mode": "affine"
1184
  },
1185
  "model.layers.36.mlp.switch_mlp.gate_proj": {
 
1198
  "mode": "mxfp4"
1199
  },
1200
  "model.layers.37.linear_attn.in_proj_qkvz": {
1201
+ "group_size": 64,
1202
+ "bits": 8,
1203
+ "mode": "affine"
1204
  },
1205
  "model.layers.37.linear_attn.in_proj_ba": {
1206
  "group_size": 64,
1207
+ "bits": 8,
1208
  "mode": "affine"
1209
  },
1210
  "model.layers.37.linear_attn.out_proj": {
1211
  "group_size": 64,
1212
+ "bits": 8,
1213
  "mode": "affine"
1214
  },
1215
  "model.layers.37.mlp.switch_mlp.gate_proj": {
 
1228
  "mode": "mxfp4"
1229
  },
1230
  "model.layers.38.linear_attn.in_proj_qkvz": {
1231
+ "group_size": 64,
1232
+ "bits": 8,
1233
+ "mode": "affine"
1234
  },
1235
  "model.layers.38.linear_attn.in_proj_ba": {
1236
  "group_size": 64,
1237
+ "bits": 8,
1238
  "mode": "affine"
1239
  },
1240
  "model.layers.38.linear_attn.out_proj": {
1241
  "group_size": 64,
1242
+ "bits": 8,
1243
  "mode": "affine"
1244
  },
1245
  "model.layers.38.mlp.switch_mlp.gate_proj": {
 
1258
  "mode": "mxfp4"
1259
  },
1260
  "model.layers.39.self_attn.q_proj": {
1261
+ "group_size": 64,
1262
+ "bits": 8,
1263
+ "mode": "affine"
1264
  },
1265
  "model.layers.39.self_attn.k_proj": {
1266
+ "group_size": 64,
1267
+ "bits": 8,
1268
+ "mode": "affine"
1269
  },
1270
  "model.layers.39.self_attn.v_proj": {
1271
+ "group_size": 64,
1272
+ "bits": 8,
1273
+ "mode": "affine"
1274
  },
1275
  "model.layers.39.self_attn.o_proj": {
1276
+ "group_size": 64,
1277
+ "bits": 8,
1278
+ "mode": "affine"
1279
  },
1280
  "model.layers.39.mlp.switch_mlp.gate_proj": {
1281
  "group_size": 32,
 
1288
  "mode": "mxfp4"
1289
  },
1290
  "model.layers.39.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1291
  "group_size": 32,
1292
  "bits": 4,
1293
  "mode": "mxfp4"
1294
  },
1295
+ "model.layers.40.linear_attn.in_proj_qkvz": {
1296
+ "group_size": 64,
1297
+ "bits": 8,
1298
+ "mode": "affine"
1299
+ },
1300
  "model.layers.40.linear_attn.in_proj_ba": {
1301
  "group_size": 64,
1302
+ "bits": 8,
1303
  "mode": "affine"
1304
  },
1305
  "model.layers.40.linear_attn.out_proj": {
1306
  "group_size": 64,
1307
+ "bits": 8,
1308
  "mode": "affine"
1309
  },
1310
  "model.layers.40.mlp.switch_mlp.gate_proj": {
 
1323
  "mode": "mxfp4"
1324
  },
1325
  "model.layers.41.linear_attn.in_proj_qkvz": {
1326
+ "group_size": 64,
1327
+ "bits": 8,
1328
+ "mode": "affine"
1329
  },
1330
  "model.layers.41.linear_attn.in_proj_ba": {
1331
  "group_size": 64,
1332
+ "bits": 8,
1333
  "mode": "affine"
1334
  },
1335
  "model.layers.41.linear_attn.out_proj": {
1336
  "group_size": 64,
1337
+ "bits": 8,
1338
  "mode": "affine"
1339
  },
1340
  "model.layers.41.mlp.switch_mlp.gate_proj": {
 
1353
  "mode": "mxfp4"
1354
  },
1355
  "model.layers.42.linear_attn.in_proj_qkvz": {
1356
+ "group_size": 64,
1357
+ "bits": 8,
1358
+ "mode": "affine"
1359
  },
1360
  "model.layers.42.linear_attn.in_proj_ba": {
1361
  "group_size": 64,
1362
+ "bits": 8,
1363
  "mode": "affine"
1364
  },
1365
  "model.layers.42.linear_attn.out_proj": {
1366
  "group_size": 64,
1367
+ "bits": 8,
1368
  "mode": "affine"
1369
  },
1370
  "model.layers.42.mlp.switch_mlp.gate_proj": {
 
1378
  "mode": "mxfp4"
1379
  },
1380
  "model.layers.42.mlp.switch_mlp.down_proj": {
1381
+ "group_size": 32,
1382
+ "bits": 4,
1383
+ "mode": "mxfp4"
1384
  },
1385
  "model.layers.43.self_attn.q_proj": {
1386
  "group_size": 64,
1387
+ "bits": 8,
1388
  "mode": "affine"
1389
  },
1390
  "model.layers.43.self_attn.k_proj": {
1391
  "group_size": 64,
1392
+ "bits": 8,
1393
  "mode": "affine"
1394
  },
1395
  "model.layers.43.self_attn.v_proj": {
1396
  "group_size": 64,
1397
+ "bits": 8,
1398
  "mode": "affine"
1399
  },
1400
  "model.layers.43.self_attn.o_proj": {
1401
  "group_size": 64,
1402
+ "bits": 8,
1403
  "mode": "affine"
1404
  },
1405
  "model.layers.43.mlp.switch_mlp.gate_proj": {
 
1413
  "mode": "mxfp4"
1414
  },
1415
  "model.layers.43.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1416
  "group_size": 32,
1417
  "bits": 4,
1418
  "mode": "mxfp4"
1419
  },
1420
+ "model.layers.44.linear_attn.in_proj_qkvz": {
1421
+ "group_size": 64,
1422
+ "bits": 8,
1423
+ "mode": "affine"
1424
+ },
1425
  "model.layers.44.linear_attn.in_proj_ba": {
1426
  "group_size": 64,
1427
+ "bits": 8,
1428
  "mode": "affine"
1429
  },
1430
  "model.layers.44.linear_attn.out_proj": {
1431
  "group_size": 64,
1432
+ "bits": 8,
1433
  "mode": "affine"
1434
  },
1435
  "model.layers.44.mlp.switch_mlp.gate_proj": {
 
1449
  },
1450
  "model.layers.45.linear_attn.in_proj_qkvz": {
1451
  "group_size": 64,
1452
+ "bits": 8,
1453
  "mode": "affine"
1454
  },
1455
  "model.layers.45.linear_attn.in_proj_ba": {
1456
  "group_size": 64,
1457
+ "bits": 8,
1458
  "mode": "affine"
1459
  },
1460
  "model.layers.45.linear_attn.out_proj": {
 
1473
  "mode": "mxfp4"
1474
  },
1475
  "model.layers.45.mlp.switch_mlp.down_proj": {
1476
+ "group_size": 32,
1477
+ "bits": 4,
1478
+ "mode": "mxfp4"
1479
  },
1480
  "model.layers.46.linear_attn.in_proj_qkvz": {
1481
  "group_size": 64,
1482
+ "bits": 8,
1483
  "mode": "affine"
1484
  },
1485
  "model.layers.46.linear_attn.in_proj_ba": {
1486
  "group_size": 64,
1487
+ "bits": 8,
1488
  "mode": "affine"
1489
  },
1490
  "model.layers.46.linear_attn.out_proj": {
 
1503
  "mode": "mxfp4"
1504
  },
1505
  "model.layers.46.mlp.switch_mlp.down_proj": {
1506
+ "group_size": 32,
1507
+ "bits": 4,
1508
+ "mode": "mxfp4"
1509
  },
1510
  "model.layers.47.self_attn.q_proj": {
1511
  "group_size": 64,
1512
+ "bits": 8,
1513
  "mode": "affine"
1514
  },
1515
  "model.layers.47.self_attn.k_proj": {
1516
  "group_size": 64,
1517
+ "bits": 8,
1518
  "mode": "affine"
1519
  },
1520
  "model.layers.47.self_attn.v_proj": {
1521
  "group_size": 64,
1522
+ "bits": 8,
1523
  "mode": "affine"
1524
  },
1525
  "model.layers.47.self_attn.o_proj": {
1526
  "group_size": 64,
1527
+ "bits": 8,
1528
  "mode": "affine"
1529
  },
1530
  "model.layers.47.mlp.switch_mlp.gate_proj": {
 
1538
  "mode": "mxfp4"
1539
  },
1540
  "model.layers.47.mlp.switch_mlp.down_proj": {
1541
+ "group_size": 32,
1542
+ "bits": 4,
1543
+ "mode": "mxfp4"
1544
  },
1545
  "lm_head": {
1546
  "group_size": 64,
1547
+ "bits": 8,
1548
  "mode": "affine"
1549
  }
1550
  },
 
1554
  "mode": "affine",
1555
  "model.embed_tokens": {
1556
  "group_size": 64,
1557
+ "bits": 8,
1558
  "mode": "affine"
1559
  },
1560
  "model.layers.0.linear_attn.in_proj_qkvz": {
1561
+ "group_size": 64,
1562
+ "bits": 8,
1563
+ "mode": "affine"
1564
  },
1565
  "model.layers.0.linear_attn.in_proj_ba": {
1566
  "group_size": 64,
1567
+ "bits": 8,
1568
  "mode": "affine"
1569
  },
1570
  "model.layers.0.linear_attn.out_proj": {
1571
  "group_size": 64,
1572
+ "bits": 8,
1573
  "mode": "affine"
1574
  },
1575
  "model.layers.0.mlp.switch_mlp.gate_proj": {
 
1588
  "mode": "mxfp4"
1589
  },
1590
  "model.layers.1.linear_attn.in_proj_qkvz": {
1591
+ "group_size": 64,
1592
+ "bits": 8,
1593
+ "mode": "affine"
1594
  },
1595
  "model.layers.1.linear_attn.in_proj_ba": {
1596
  "group_size": 64,
1597
+ "bits": 8,
1598
  "mode": "affine"
1599
  },
1600
  "model.layers.1.linear_attn.out_proj": {
1601
  "group_size": 64,
1602
+ "bits": 8,
1603
  "mode": "affine"
1604
  },
1605
  "model.layers.1.mlp.switch_mlp.gate_proj": {
 
1613
  "mode": "mxfp4"
1614
  },
1615
  "model.layers.1.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1616
  "group_size": 32,
1617
  "bits": 4,
1618
  "mode": "mxfp4"
1619
  },
1620
+ "model.layers.2.linear_attn.in_proj_qkvz": {
1621
+ "group_size": 64,
1622
+ "bits": 8,
1623
+ "mode": "affine"
1624
+ },
1625
  "model.layers.2.linear_attn.in_proj_ba": {
1626
  "group_size": 64,
1627
+ "bits": 8,
1628
  "mode": "affine"
1629
  },
1630
  "model.layers.2.linear_attn.out_proj": {
1631
  "group_size": 64,
1632
+ "bits": 8,
1633
  "mode": "affine"
1634
  },
1635
  "model.layers.2.mlp.switch_mlp.gate_proj": {
 
1648
  "mode": "mxfp4"
1649
  },
1650
  "model.layers.3.self_attn.q_proj": {
1651
+ "group_size": 64,
1652
+ "bits": 8,
1653
+ "mode": "affine"
1654
  },
1655
  "model.layers.3.self_attn.k_proj": {
1656
+ "group_size": 64,
1657
+ "bits": 8,
1658
+ "mode": "affine"
1659
  },
1660
  "model.layers.3.self_attn.v_proj": {
1661
+ "group_size": 64,
1662
+ "bits": 8,
1663
+ "mode": "affine"
1664
  },
1665
  "model.layers.3.self_attn.o_proj": {
1666
+ "group_size": 64,
1667
+ "bits": 8,
1668
+ "mode": "affine"
1669
  },
1670
  "model.layers.3.mlp.switch_mlp.gate_proj": {
1671
  "group_size": 32,
 
1683
  "mode": "mxfp4"
1684
  },
1685
  "model.layers.4.linear_attn.in_proj_qkvz": {
1686
+ "group_size": 64,
1687
+ "bits": 8,
1688
+ "mode": "affine"
1689
  },
1690
  "model.layers.4.linear_attn.in_proj_ba": {
1691
  "group_size": 64,
1692
+ "bits": 8,
1693
  "mode": "affine"
1694
  },
1695
  "model.layers.4.linear_attn.out_proj": {
1696
  "group_size": 64,
1697
+ "bits": 8,
1698
  "mode": "affine"
1699
  },
1700
  "model.layers.4.mlp.switch_mlp.gate_proj": {
 
1713
  "mode": "mxfp4"
1714
  },
1715
  "model.layers.5.linear_attn.in_proj_qkvz": {
1716
+ "group_size": 64,
1717
+ "bits": 8,
1718
+ "mode": "affine"
1719
  },
1720
  "model.layers.5.linear_attn.in_proj_ba": {
1721
  "group_size": 64,
1722
+ "bits": 8,
1723
  "mode": "affine"
1724
  },
1725
  "model.layers.5.linear_attn.out_proj": {
1726
  "group_size": 64,
1727
+ "bits": 8,
1728
  "mode": "affine"
1729
  },
1730
  "model.layers.5.mlp.switch_mlp.gate_proj": {
 
1743
  "mode": "mxfp4"
1744
  },
1745
  "model.layers.6.linear_attn.in_proj_qkvz": {
1746
+ "group_size": 64,
1747
+ "bits": 8,
1748
+ "mode": "affine"
1749
  },
1750
  "model.layers.6.linear_attn.in_proj_ba": {
1751
  "group_size": 64,
1752
+ "bits": 8,
1753
  "mode": "affine"
1754
  },
1755
  "model.layers.6.linear_attn.out_proj": {
1756
  "group_size": 64,
1757
+ "bits": 8,
1758
  "mode": "affine"
1759
  },
1760
  "model.layers.6.mlp.switch_mlp.gate_proj": {
 
1768
  "mode": "mxfp4"
1769
  },
1770
  "model.layers.6.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1771
  "group_size": 32,
1772
  "bits": 4,
1773
  "mode": "mxfp4"
1774
  },
1775
+ "model.layers.7.self_attn.q_proj": {
1776
+ "group_size": 64,
1777
+ "bits": 8,
1778
+ "mode": "affine"
1779
+ },
1780
  "model.layers.7.self_attn.k_proj": {
1781
+ "group_size": 64,
1782
+ "bits": 8,
1783
+ "mode": "affine"
1784
  },
1785
  "model.layers.7.self_attn.v_proj": {
1786
+ "group_size": 64,
1787
+ "bits": 8,
1788
+ "mode": "affine"
1789
  },
1790
  "model.layers.7.self_attn.o_proj": {
1791
+ "group_size": 64,
1792
+ "bits": 8,
1793
+ "mode": "affine"
1794
  },
1795
  "model.layers.7.mlp.switch_mlp.gate_proj": {
1796
  "group_size": 32,
 
1808
  "mode": "mxfp4"
1809
  },
1810
  "model.layers.8.linear_attn.in_proj_qkvz": {
1811
+ "group_size": 64,
1812
+ "bits": 8,
1813
+ "mode": "affine"
1814
  },
1815
  "model.layers.8.linear_attn.in_proj_ba": {
1816
  "group_size": 64,
1817
+ "bits": 8,
1818
  "mode": "affine"
1819
  },
1820
  "model.layers.8.linear_attn.out_proj": {
1821
  "group_size": 64,
1822
+ "bits": 8,
1823
  "mode": "affine"
1824
  },
1825
  "model.layers.8.mlp.switch_mlp.gate_proj": {
 
1838
  "mode": "mxfp4"
1839
  },
1840
  "model.layers.9.linear_attn.in_proj_qkvz": {
1841
+ "group_size": 64,
1842
+ "bits": 8,
1843
+ "mode": "affine"
1844
  },
1845
  "model.layers.9.linear_attn.in_proj_ba": {
1846
  "group_size": 64,
1847
+ "bits": 8,
1848
  "mode": "affine"
1849
  },
1850
  "model.layers.9.linear_attn.out_proj": {
1851
  "group_size": 64,
1852
+ "bits": 8,
1853
  "mode": "affine"
1854
  },
1855
  "model.layers.9.mlp.switch_mlp.gate_proj": {
 
1868
  "mode": "mxfp4"
1869
  },
1870
  "model.layers.10.linear_attn.in_proj_qkvz": {
1871
+ "group_size": 64,
1872
+ "bits": 8,
1873
+ "mode": "affine"
1874
  },
1875
  "model.layers.10.linear_attn.in_proj_ba": {
1876
  "group_size": 64,
1877
+ "bits": 8,
1878
  "mode": "affine"
1879
  },
1880
  "model.layers.10.linear_attn.out_proj": {
1881
  "group_size": 64,
1882
+ "bits": 8,
1883
  "mode": "affine"
1884
  },
1885
  "model.layers.10.mlp.switch_mlp.gate_proj": {
 
1893
  "mode": "mxfp4"
1894
  },
1895
  "model.layers.10.mlp.switch_mlp.down_proj": {
 
 
 
 
 
1896
  "group_size": 32,
1897
  "bits": 4,
1898
  "mode": "mxfp4"
1899
  },
1900
+ "model.layers.11.self_attn.q_proj": {
1901
+ "group_size": 64,
1902
+ "bits": 8,
1903
+ "mode": "affine"
1904
+ },
1905
  "model.layers.11.self_attn.k_proj": {
1906
+ "group_size": 64,
1907
+ "bits": 8,
1908
+ "mode": "affine"
1909
  },
1910
  "model.layers.11.self_attn.v_proj": {
1911
+ "group_size": 64,
1912
+ "bits": 8,
1913
+ "mode": "affine"
1914
  },
1915
  "model.layers.11.self_attn.o_proj": {
1916
+ "group_size": 64,
1917
+ "bits": 8,
1918
+ "mode": "affine"
1919
  },
1920
  "model.layers.11.mlp.switch_mlp.gate_proj": {
1921
  "group_size": 32,
 
1933
  "mode": "mxfp4"
1934
  },
1935
  "model.layers.12.linear_attn.in_proj_qkvz": {
1936
+ "group_size": 64,
1937
+ "bits": 8,
1938
+ "mode": "affine"
1939
  },
1940
  "model.layers.12.linear_attn.in_proj_ba": {
1941
  "group_size": 64,
1942
+ "bits": 8,
1943
  "mode": "affine"
1944
  },
1945
  "model.layers.12.linear_attn.out_proj": {
1946
  "group_size": 64,
1947
+ "bits": 8,
1948
  "mode": "affine"
1949
  },
1950
  "model.layers.12.mlp.switch_mlp.gate_proj": {
 
1963
  "mode": "mxfp4"
1964
  },
1965
  "model.layers.13.linear_attn.in_proj_qkvz": {
1966
+ "group_size": 64,
1967
+ "bits": 8,
1968
+ "mode": "affine"
1969
  },
1970
  "model.layers.13.linear_attn.in_proj_ba": {
1971
  "group_size": 64,
1972
+ "bits": 8,
1973
  "mode": "affine"
1974
  },
1975
  "model.layers.13.linear_attn.out_proj": {
1976
  "group_size": 64,
1977
+ "bits": 8,
1978
  "mode": "affine"
1979
  },
1980
  "model.layers.13.mlp.switch_mlp.gate_proj": {
 
1993
  "mode": "mxfp4"
1994
  },
1995
  "model.layers.14.linear_attn.in_proj_qkvz": {
1996
+ "group_size": 64,
1997
+ "bits": 8,
1998
+ "mode": "affine"
1999
  },
2000
  "model.layers.14.linear_attn.in_proj_ba": {
2001
  "group_size": 64,
2002
+ "bits": 8,
2003
  "mode": "affine"
2004
  },
2005
  "model.layers.14.linear_attn.out_proj": {
2006
  "group_size": 64,
2007
+ "bits": 8,
2008
  "mode": "affine"
2009
  },
2010
  "model.layers.14.mlp.switch_mlp.gate_proj": {
 
2018
  "mode": "mxfp4"
2019
  },
2020
  "model.layers.14.mlp.switch_mlp.down_proj": {
 
 
 
 
 
2021
  "group_size": 32,
2022
  "bits": 4,
2023
  "mode": "mxfp4"
2024
  },
2025
+ "model.layers.15.self_attn.q_proj": {
2026
+ "group_size": 64,
2027
+ "bits": 8,
2028
+ "mode": "affine"
2029
+ },
2030
  "model.layers.15.self_attn.k_proj": {
2031
+ "group_size": 64,
2032
+ "bits": 8,
2033
+ "mode": "affine"
2034
  },
2035
  "model.layers.15.self_attn.v_proj": {
2036
+ "group_size": 64,
2037
+ "bits": 8,
2038
+ "mode": "affine"
2039
  },
2040
  "model.layers.15.self_attn.o_proj": {
2041
+ "group_size": 64,
2042
+ "bits": 8,
2043
+ "mode": "affine"
2044
  },
2045
  "model.layers.15.mlp.switch_mlp.gate_proj": {
2046
  "group_size": 32,
 
2058
  "mode": "mxfp4"
2059
  },
2060
  "model.layers.16.linear_attn.in_proj_qkvz": {
2061
+ "group_size": 64,
2062
+ "bits": 8,
2063
+ "mode": "affine"
2064
  },
2065
  "model.layers.16.linear_attn.in_proj_ba": {
2066
  "group_size": 64,
2067
+ "bits": 8,
2068
  "mode": "affine"
2069
  },
2070
  "model.layers.16.linear_attn.out_proj": {
2071
  "group_size": 64,
2072
+ "bits": 8,
2073
  "mode": "affine"
2074
  },
2075
  "model.layers.16.mlp.switch_mlp.gate_proj": {
 
2088
  "mode": "mxfp4"
2089
  },
2090
  "model.layers.17.linear_attn.in_proj_qkvz": {
2091
+ "group_size": 64,
2092
+ "bits": 8,
2093
+ "mode": "affine"
2094
  },
2095
  "model.layers.17.linear_attn.in_proj_ba": {
2096
  "group_size": 64,
2097
+ "bits": 8,
2098
  "mode": "affine"
2099
  },
2100
  "model.layers.17.linear_attn.out_proj": {
2101
  "group_size": 64,
2102
+ "bits": 8,
2103
  "mode": "affine"
2104
  },
2105
  "model.layers.17.mlp.switch_mlp.gate_proj": {
 
2118
  "mode": "mxfp4"
2119
  },
2120
  "model.layers.18.linear_attn.in_proj_qkvz": {
2121
+ "group_size": 64,
2122
+ "bits": 8,
2123
+ "mode": "affine"
2124
  },
2125
  "model.layers.18.linear_attn.in_proj_ba": {
2126
  "group_size": 64,
2127
+ "bits": 8,
2128
  "mode": "affine"
2129
  },
2130
  "model.layers.18.linear_attn.out_proj": {
2131
  "group_size": 64,
2132
+ "bits": 8,
2133
  "mode": "affine"
2134
  },
2135
  "model.layers.18.mlp.switch_mlp.gate_proj": {
 
2148
  "mode": "mxfp4"
2149
  },
2150
  "model.layers.19.self_attn.q_proj": {
2151
+ "group_size": 64,
2152
+ "bits": 8,
2153
+ "mode": "affine"
2154
  },
2155
  "model.layers.19.self_attn.k_proj": {
2156
+ "group_size": 64,
2157
+ "bits": 8,
2158
+ "mode": "affine"
2159
  },
2160
  "model.layers.19.self_attn.v_proj": {
2161
+ "group_size": 64,
2162
+ "bits": 8,
2163
+ "mode": "affine"
2164
  },
2165
  "model.layers.19.self_attn.o_proj": {
2166
+ "group_size": 64,
2167
+ "bits": 8,
2168
+ "mode": "affine"
2169
  },
2170
  "model.layers.19.mlp.switch_mlp.gate_proj": {
2171
  "group_size": 32,
 
2183
  "mode": "mxfp4"
2184
  },
2185
  "model.layers.20.linear_attn.in_proj_qkvz": {
2186
+ "group_size": 64,
2187
+ "bits": 8,
2188
+ "mode": "affine"
2189
  },
2190
  "model.layers.20.linear_attn.in_proj_ba": {
2191
  "group_size": 64,
2192
+ "bits": 8,
2193
  "mode": "affine"
2194
  },
2195
  "model.layers.20.linear_attn.out_proj": {
2196
  "group_size": 64,
2197
+ "bits": 8,
2198
  "mode": "affine"
2199
  },
2200
  "model.layers.20.mlp.switch_mlp.gate_proj": {
 
2213
  "mode": "mxfp4"
2214
  },
2215
  "model.layers.21.linear_attn.in_proj_qkvz": {
2216
+ "group_size": 64,
2217
+ "bits": 8,
2218
+ "mode": "affine"
2219
  },
2220
  "model.layers.21.linear_attn.in_proj_ba": {
2221
  "group_size": 64,
2222
+ "bits": 8,
2223
  "mode": "affine"
2224
  },
2225
  "model.layers.21.linear_attn.out_proj": {
2226
  "group_size": 64,
2227
+ "bits": 8,
2228
  "mode": "affine"
2229
  },
2230
  "model.layers.21.mlp.switch_mlp.gate_proj": {
 
2243
  "mode": "mxfp4"
2244
  },
2245
  "model.layers.22.linear_attn.in_proj_qkvz": {
2246
+ "group_size": 64,
2247
+ "bits": 8,
2248
+ "mode": "affine"
2249
  },
2250
  "model.layers.22.linear_attn.in_proj_ba": {
2251
  "group_size": 64,
2252
+ "bits": 8,
2253
  "mode": "affine"
2254
  },
2255
  "model.layers.22.linear_attn.out_proj": {
2256
  "group_size": 64,
2257
+ "bits": 8,
2258
  "mode": "affine"
2259
  },
2260
  "model.layers.22.mlp.switch_mlp.gate_proj": {
 
2268
  "mode": "mxfp4"
2269
  },
2270
  "model.layers.22.mlp.switch_mlp.down_proj": {
 
 
 
 
 
2271
  "group_size": 32,
2272
  "bits": 4,
2273
  "mode": "mxfp4"
2274
  },
2275
+ "model.layers.23.self_attn.q_proj": {
2276
+ "group_size": 64,
2277
+ "bits": 8,
2278
+ "mode": "affine"
2279
+ },
2280
  "model.layers.23.self_attn.k_proj": {
2281
+ "group_size": 64,
2282
+ "bits": 8,
2283
+ "mode": "affine"
2284
  },
2285
  "model.layers.23.self_attn.v_proj": {
2286
+ "group_size": 64,
2287
+ "bits": 8,
2288
+ "mode": "affine"
2289
  },
2290
  "model.layers.23.self_attn.o_proj": {
2291
+ "group_size": 64,
2292
+ "bits": 8,
2293
+ "mode": "affine"
2294
  },
2295
  "model.layers.23.mlp.switch_mlp.gate_proj": {
2296
  "group_size": 32,
 
2308
  "mode": "mxfp4"
2309
  },
2310
  "model.layers.24.linear_attn.in_proj_qkvz": {
2311
+ "group_size": 64,
2312
+ "bits": 8,
2313
+ "mode": "affine"
2314
  },
2315
  "model.layers.24.linear_attn.in_proj_ba": {
2316
  "group_size": 64,
2317
+ "bits": 8,
2318
  "mode": "affine"
2319
  },
2320
  "model.layers.24.linear_attn.out_proj": {
2321
  "group_size": 64,
2322
+ "bits": 8,
2323
  "mode": "affine"
2324
  },
2325
  "model.layers.24.mlp.switch_mlp.gate_proj": {
 
2338
  "mode": "mxfp4"
2339
  },
2340
  "model.layers.25.linear_attn.in_proj_qkvz": {
2341
+ "group_size": 64,
2342
+ "bits": 8,
2343
+ "mode": "affine"
2344
  },
2345
  "model.layers.25.linear_attn.in_proj_ba": {
2346
  "group_size": 64,
2347
+ "bits": 8,
2348
  "mode": "affine"
2349
  },
2350
  "model.layers.25.linear_attn.out_proj": {
2351
  "group_size": 64,
2352
+ "bits": 8,
2353
  "mode": "affine"
2354
  },
2355
  "model.layers.25.mlp.switch_mlp.gate_proj": {
 
2368
  "mode": "mxfp4"
2369
  },
2370
  "model.layers.26.linear_attn.in_proj_qkvz": {
2371
+ "group_size": 64,
2372
+ "bits": 8,
2373
+ "mode": "affine"
2374
  },
2375
  "model.layers.26.linear_attn.in_proj_ba": {
2376
  "group_size": 64,
2377
+ "bits": 8,
2378
  "mode": "affine"
2379
  },
2380
  "model.layers.26.linear_attn.out_proj": {
2381
  "group_size": 64,
2382
+ "bits": 8,
2383
  "mode": "affine"
2384
  },
2385
  "model.layers.26.mlp.switch_mlp.gate_proj": {
 
2392
  "bits": 4,
2393
  "mode": "mxfp4"
2394
  },
2395
+ "model.layers.26.mlp.switch_mlp.down_proj": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2396
  "group_size": 32,
2397
  "bits": 4,
2398
  "mode": "mxfp4"
2399
  },
2400
+ "model.layers.27.self_attn.q_proj": {
2401
+ "group_size": 64,
2402
+ "bits": 8,
2403
+ "mode": "affine"
2404
+ },
2405
+ "model.layers.27.self_attn.k_proj": {
2406
+ "group_size": 64,
2407
+ "bits": 8,
2408
+ "mode": "affine"
2409
+ },
2410
+ "model.layers.27.self_attn.v_proj": {
2411
+ "group_size": 64,
2412
+ "bits": 8,
2413
+ "mode": "affine"
2414
+ },
2415
  "model.layers.27.self_attn.o_proj": {
2416
+ "group_size": 64,
2417
+ "bits": 8,
2418
+ "mode": "affine"
2419
  },
2420
  "model.layers.27.mlp.switch_mlp.gate_proj": {
2421
  "group_size": 32,
 
2433
  "mode": "mxfp4"
2434
  },
2435
  "model.layers.28.linear_attn.in_proj_qkvz": {
2436
+ "group_size": 64,
2437
+ "bits": 8,
2438
+ "mode": "affine"
2439
  },
2440
  "model.layers.28.linear_attn.in_proj_ba": {
2441
  "group_size": 64,
2442
+ "bits": 8,
2443
  "mode": "affine"
2444
  },
2445
  "model.layers.28.linear_attn.out_proj": {
2446
  "group_size": 64,
2447
+ "bits": 8,
2448
  "mode": "affine"
2449
  },
2450
  "model.layers.28.mlp.switch_mlp.gate_proj": {
 
2463
  "mode": "mxfp4"
2464
  },
2465
  "model.layers.29.linear_attn.in_proj_qkvz": {
2466
+ "group_size": 64,
2467
+ "bits": 8,
2468
+ "mode": "affine"
2469
  },
2470
  "model.layers.29.linear_attn.in_proj_ba": {
2471
  "group_size": 64,
2472
+ "bits": 8,
2473
  "mode": "affine"
2474
  },
2475
  "model.layers.29.linear_attn.out_proj": {
2476
  "group_size": 64,
2477
+ "bits": 8,
2478
  "mode": "affine"
2479
  },
2480
  "model.layers.29.mlp.switch_mlp.gate_proj": {
 
2493
  "mode": "mxfp4"
2494
  },
2495
  "model.layers.30.linear_attn.in_proj_qkvz": {
2496
+ "group_size": 64,
2497
+ "bits": 8,
2498
+ "mode": "affine"
2499
  },
2500
  "model.layers.30.linear_attn.in_proj_ba": {
2501
  "group_size": 64,
2502
+ "bits": 8,
2503
  "mode": "affine"
2504
  },
2505
  "model.layers.30.linear_attn.out_proj": {
2506
  "group_size": 64,
2507
+ "bits": 8,
2508
  "mode": "affine"
2509
  },
2510
  "model.layers.30.mlp.switch_mlp.gate_proj": {
 
2523
  "mode": "mxfp4"
2524
  },
2525
  "model.layers.31.self_attn.q_proj": {
2526
+ "group_size": 64,
2527
+ "bits": 8,
2528
+ "mode": "affine"
2529
  },
2530
  "model.layers.31.self_attn.k_proj": {
2531
+ "group_size": 64,
2532
+ "bits": 8,
2533
+ "mode": "affine"
2534
  },
2535
  "model.layers.31.self_attn.v_proj": {
2536
+ "group_size": 64,
2537
+ "bits": 8,
2538
+ "mode": "affine"
2539
  },
2540
  "model.layers.31.self_attn.o_proj": {
2541
+ "group_size": 64,
2542
+ "bits": 8,
2543
+ "mode": "affine"
2544
  },
2545
  "model.layers.31.mlp.switch_mlp.gate_proj": {
2546
  "group_size": 32,
 
2553
  "mode": "mxfp4"
2554
  },
2555
  "model.layers.31.mlp.switch_mlp.down_proj": {
 
 
 
 
 
2556
  "group_size": 32,
2557
  "bits": 4,
2558
  "mode": "mxfp4"
2559
  },
2560
+ "model.layers.32.linear_attn.in_proj_qkvz": {
2561
+ "group_size": 64,
2562
+ "bits": 8,
2563
+ "mode": "affine"
2564
+ },
2565
  "model.layers.32.linear_attn.in_proj_ba": {
2566
  "group_size": 64,
2567
+ "bits": 8,
2568
  "mode": "affine"
2569
  },
2570
  "model.layers.32.linear_attn.out_proj": {
2571
  "group_size": 64,
2572
+ "bits": 8,
2573
  "mode": "affine"
2574
  },
2575
  "model.layers.32.mlp.switch_mlp.gate_proj": {
 
2588
  "mode": "mxfp4"
2589
  },
2590
  "model.layers.33.linear_attn.in_proj_qkvz": {
2591
+ "group_size": 64,
2592
+ "bits": 8,
2593
+ "mode": "affine"
2594
  },
2595
  "model.layers.33.linear_attn.in_proj_ba": {
2596
  "group_size": 64,
2597
+ "bits": 8,
2598
  "mode": "affine"
2599
  },
2600
  "model.layers.33.linear_attn.out_proj": {
2601
  "group_size": 64,
2602
+ "bits": 8,
2603
  "mode": "affine"
2604
  },
2605
  "model.layers.33.mlp.switch_mlp.gate_proj": {
 
2618
  "mode": "mxfp4"
2619
  },
2620
  "model.layers.34.linear_attn.in_proj_qkvz": {
2621
+ "group_size": 64,
2622
+ "bits": 8,
2623
+ "mode": "affine"
2624
  },
2625
  "model.layers.34.linear_attn.in_proj_ba": {
2626
  "group_size": 64,
2627
+ "bits": 8,
2628
  "mode": "affine"
2629
  },
2630
  "model.layers.34.linear_attn.out_proj": {
2631
  "group_size": 64,
2632
+ "bits": 8,
2633
  "mode": "affine"
2634
  },
2635
  "model.layers.34.mlp.switch_mlp.gate_proj": {
 
2648
  "mode": "mxfp4"
2649
  },
2650
  "model.layers.35.self_attn.q_proj": {
2651
+ "group_size": 64,
2652
+ "bits": 8,
2653
+ "mode": "affine"
2654
  },
2655
  "model.layers.35.self_attn.k_proj": {
2656
+ "group_size": 64,
2657
+ "bits": 8,
2658
+ "mode": "affine"
2659
  },
2660
  "model.layers.35.self_attn.v_proj": {
2661
+ "group_size": 64,
2662
+ "bits": 8,
2663
+ "mode": "affine"
2664
  },
2665
  "model.layers.35.self_attn.o_proj": {
2666
+ "group_size": 64,
2667
+ "bits": 8,
2668
+ "mode": "affine"
2669
  },
2670
  "model.layers.35.mlp.switch_mlp.gate_proj": {
2671
  "group_size": 32,
 
2683
  "mode": "mxfp4"
2684
  },
2685
  "model.layers.36.linear_attn.in_proj_qkvz": {
2686
+ "group_size": 64,
2687
+ "bits": 8,
2688
+ "mode": "affine"
2689
  },
2690
  "model.layers.36.linear_attn.in_proj_ba": {
2691
  "group_size": 64,
2692
+ "bits": 8,
2693
  "mode": "affine"
2694
  },
2695
  "model.layers.36.linear_attn.out_proj": {
2696
  "group_size": 64,
2697
+ "bits": 8,
2698
  "mode": "affine"
2699
  },
2700
  "model.layers.36.mlp.switch_mlp.gate_proj": {
 
2713
  "mode": "mxfp4"
2714
  },
2715
  "model.layers.37.linear_attn.in_proj_qkvz": {
2716
+ "group_size": 64,
2717
+ "bits": 8,
2718
+ "mode": "affine"
2719
  },
2720
  "model.layers.37.linear_attn.in_proj_ba": {
2721
  "group_size": 64,
2722
+ "bits": 8,
2723
  "mode": "affine"
2724
  },
2725
  "model.layers.37.linear_attn.out_proj": {
2726
  "group_size": 64,
2727
+ "bits": 8,
2728
  "mode": "affine"
2729
  },
2730
  "model.layers.37.mlp.switch_mlp.gate_proj": {
 
2743
  "mode": "mxfp4"
2744
  },
2745
  "model.layers.38.linear_attn.in_proj_qkvz": {
2746
+ "group_size": 64,
2747
+ "bits": 8,
2748
+ "mode": "affine"
2749
  },
2750
  "model.layers.38.linear_attn.in_proj_ba": {
2751
  "group_size": 64,
2752
+ "bits": 8,
2753
  "mode": "affine"
2754
  },
2755
  "model.layers.38.linear_attn.out_proj": {
2756
  "group_size": 64,
2757
+ "bits": 8,
2758
  "mode": "affine"
2759
  },
2760
  "model.layers.38.mlp.switch_mlp.gate_proj": {
 
2773
  "mode": "mxfp4"
2774
  },
2775
  "model.layers.39.self_attn.q_proj": {
2776
+ "group_size": 64,
2777
+ "bits": 8,
2778
+ "mode": "affine"
2779
  },
2780
  "model.layers.39.self_attn.k_proj": {
2781
+ "group_size": 64,
2782
+ "bits": 8,
2783
+ "mode": "affine"
2784
  },
2785
  "model.layers.39.self_attn.v_proj": {
2786
+ "group_size": 64,
2787
+ "bits": 8,
2788
+ "mode": "affine"
2789
  },
2790
  "model.layers.39.self_attn.o_proj": {
2791
+ "group_size": 64,
2792
+ "bits": 8,
2793
+ "mode": "affine"
2794
  },
2795
  "model.layers.39.mlp.switch_mlp.gate_proj": {
2796
  "group_size": 32,
 
2803
  "mode": "mxfp4"
2804
  },
2805
  "model.layers.39.mlp.switch_mlp.down_proj": {
 
 
 
 
 
2806
  "group_size": 32,
2807
  "bits": 4,
2808
  "mode": "mxfp4"
2809
  },
2810
+ "model.layers.40.linear_attn.in_proj_qkvz": {
2811
+ "group_size": 64,
2812
+ "bits": 8,
2813
+ "mode": "affine"
2814
+ },
2815
  "model.layers.40.linear_attn.in_proj_ba": {
2816
  "group_size": 64,
2817
+ "bits": 8,
2818
  "mode": "affine"
2819
  },
2820
  "model.layers.40.linear_attn.out_proj": {
2821
  "group_size": 64,
2822
+ "bits": 8,
2823
  "mode": "affine"
2824
  },
2825
  "model.layers.40.mlp.switch_mlp.gate_proj": {
 
2838
  "mode": "mxfp4"
2839
  },
2840
  "model.layers.41.linear_attn.in_proj_qkvz": {
2841
+ "group_size": 64,
2842
+ "bits": 8,
2843
+ "mode": "affine"
2844
  },
2845
  "model.layers.41.linear_attn.in_proj_ba": {
2846
  "group_size": 64,
2847
+ "bits": 8,
2848
  "mode": "affine"
2849
  },
2850
  "model.layers.41.linear_attn.out_proj": {
2851
  "group_size": 64,
2852
+ "bits": 8,
2853
  "mode": "affine"
2854
  },
2855
  "model.layers.41.mlp.switch_mlp.gate_proj": {
 
2868
  "mode": "mxfp4"
2869
  },
2870
  "model.layers.42.linear_attn.in_proj_qkvz": {
2871
+ "group_size": 64,
2872
+ "bits": 8,
2873
+ "mode": "affine"
2874
  },
2875
  "model.layers.42.linear_attn.in_proj_ba": {
2876
  "group_size": 64,
2877
+ "bits": 8,
2878
  "mode": "affine"
2879
  },
2880
  "model.layers.42.linear_attn.out_proj": {
2881
  "group_size": 64,
2882
+ "bits": 8,
2883
  "mode": "affine"
2884
  },
2885
  "model.layers.42.mlp.switch_mlp.gate_proj": {
 
2893
  "mode": "mxfp4"
2894
  },
2895
  "model.layers.42.mlp.switch_mlp.down_proj": {
2896
+ "group_size": 32,
2897
+ "bits": 4,
2898
+ "mode": "mxfp4"
2899
  },
2900
  "model.layers.43.self_attn.q_proj": {
2901
  "group_size": 64,
2902
+ "bits": 8,
2903
  "mode": "affine"
2904
  },
2905
  "model.layers.43.self_attn.k_proj": {
2906
  "group_size": 64,
2907
+ "bits": 8,
2908
  "mode": "affine"
2909
  },
2910
  "model.layers.43.self_attn.v_proj": {
2911
  "group_size": 64,
2912
+ "bits": 8,
2913
  "mode": "affine"
2914
  },
2915
  "model.layers.43.self_attn.o_proj": {
2916
  "group_size": 64,
2917
+ "bits": 8,
2918
  "mode": "affine"
2919
  },
2920
  "model.layers.43.mlp.switch_mlp.gate_proj": {
 
2928
  "mode": "mxfp4"
2929
  },
2930
  "model.layers.43.mlp.switch_mlp.down_proj": {
 
 
 
 
 
2931
  "group_size": 32,
2932
  "bits": 4,
2933
  "mode": "mxfp4"
2934
  },
2935
+ "model.layers.44.linear_attn.in_proj_qkvz": {
2936
+ "group_size": 64,
2937
+ "bits": 8,
2938
+ "mode": "affine"
2939
+ },
2940
  "model.layers.44.linear_attn.in_proj_ba": {
2941
  "group_size": 64,
2942
+ "bits": 8,
2943
  "mode": "affine"
2944
  },
2945
  "model.layers.44.linear_attn.out_proj": {
2946
  "group_size": 64,
2947
+ "bits": 8,
2948
  "mode": "affine"
2949
  },
2950
  "model.layers.44.mlp.switch_mlp.gate_proj": {
 
2964
  },
2965
  "model.layers.45.linear_attn.in_proj_qkvz": {
2966
  "group_size": 64,
2967
+ "bits": 8,
2968
  "mode": "affine"
2969
  },
2970
  "model.layers.45.linear_attn.in_proj_ba": {
2971
  "group_size": 64,
2972
+ "bits": 8,
2973
  "mode": "affine"
2974
  },
2975
  "model.layers.45.linear_attn.out_proj": {
 
2988
  "mode": "mxfp4"
2989
  },
2990
  "model.layers.45.mlp.switch_mlp.down_proj": {
2991
+ "group_size": 32,
2992
+ "bits": 4,
2993
+ "mode": "mxfp4"
2994
  },
2995
  "model.layers.46.linear_attn.in_proj_qkvz": {
2996
  "group_size": 64,
2997
+ "bits": 8,
2998
  "mode": "affine"
2999
  },
3000
  "model.layers.46.linear_attn.in_proj_ba": {
3001
  "group_size": 64,
3002
+ "bits": 8,
3003
  "mode": "affine"
3004
  },
3005
  "model.layers.46.linear_attn.out_proj": {
 
3018
  "mode": "mxfp4"
3019
  },
3020
  "model.layers.46.mlp.switch_mlp.down_proj": {
3021
+ "group_size": 32,
3022
+ "bits": 4,
3023
+ "mode": "mxfp4"
3024
  },
3025
  "model.layers.47.self_attn.q_proj": {
3026
  "group_size": 64,
3027
+ "bits": 8,
3028
  "mode": "affine"
3029
  },
3030
  "model.layers.47.self_attn.k_proj": {
3031
  "group_size": 64,
3032
+ "bits": 8,
3033
  "mode": "affine"
3034
  },
3035
  "model.layers.47.self_attn.v_proj": {
3036
  "group_size": 64,
3037
+ "bits": 8,
3038
  "mode": "affine"
3039
  },
3040
  "model.layers.47.self_attn.o_proj": {
3041
  "group_size": 64,
3042
+ "bits": 8,
3043
  "mode": "affine"
3044
  },
3045
  "model.layers.47.mlp.switch_mlp.gate_proj": {
 
3053
  "mode": "mxfp4"
3054
  },
3055
  "model.layers.47.mlp.switch_mlp.down_proj": {
3056
+ "group_size": 32,
3057
+ "bits": 4,
3058
+ "mode": "mxfp4"
3059
  },
3060
  "lm_head": {
3061
  "group_size": 64,
3062
+ "bits": 8,
3063
  "mode": "affine"
3064
  }
3065
  },
model-00001-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2affc16c11733e8eb69ff481afbf1b96fcc2feae90fdb93aec4aa7fc05c165b9
3
- size 5307013181
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a19555cb848e8db762bcd0423f35fd9b6fa3e2ef0f9daea337f8a378051b949
3
+ size 5136685826
model-00002-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9f1cd72bf80e0aa83409d81969ee4350a75a53d8713a58a13e764ce2a8feede
3
- size 5279957955
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05d512b2820924fe14ae37f6c469d91147e8acbc8cf087ac5d3c7fcc476d9db2
3
+ size 5354539238
model-00003-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9387f3eb3deafc76d59c96c5023072b10a1c7800853bec8ddc59e14ee51d3f3a
3
- size 5136900516
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c63f680db9da22a4651ec48e31ee47385d08fafdb87460ee0a152e5210314b9f
3
+ size 5109835354
model-00004-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:668b6884c6523d380486e336abb1f0bd1047fa122afa6670e331b41f6eeb9fe8
3
- size 5362257844
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08458af2bee021aec677bd00f0ffcabd6c24cd111f7233d16b605c1f6176a5bb
3
+ size 5367122254
model-00005-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a69d9d11475ff9755f2dde7aedded137d22f93565f37dbb9b44367be242f19
3
- size 5319890802
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397a531b233251105ce94cfef0f24712d16b3615791cb1da0e2d732760b4bb27
3
+ size 5368422424
model-00006-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16ebb1a6599085e807e6d769df86d7eb7a7c18168eda40c75f724a8e14c7aad1
3
- size 5113745299
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f60cf313663a1ab19ba9f384bee4dceee9c82a90ea9693cab5ccdb3e615087d
3
+ size 5352187259
model-00007-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61d131f5b3f141d9fa9d2fe9f0a79ca3c570f943143b7c295c6293f77ae073c1
3
- size 5128962934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f75b6cf275b49b7e26b762634e190109c6829320f915fbbfd30083564d3e3e6c
3
+ size 5352767299
model-00008-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:730e135811db136ec15460c1e9cfeb80661736be2f54ca05d94b584601d17782
3
- size 4988768634
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbdee11a1b04dd2669539ef1b7d73536923733a7a3a6b8900270a056c8477f5a
3
+ size 5109835416
model-00009-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3aa283b96888a44073b2aeb162d28f8d73f246c8af067a7af1de3dd16b806d6
3
- size 2756930749
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65acab9e2eea14867519ee414621656e5e17bc8a5fb63a34059fad88fa7159da
3
+ size 1508185434
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 44394237440,
4
  "total_parameters": 79674388992
5
  },
6
  "weight_map": {
@@ -17,6 +17,7 @@
17
  "model.layers.0.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
18
  "model.layers.0.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
19
  "model.layers.0.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
 
20
  "model.layers.0.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
21
  "model.layers.0.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
22
  "model.layers.0.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
@@ -50,6 +51,7 @@
50
  "model.layers.1.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
51
  "model.layers.1.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
52
  "model.layers.1.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
 
53
  "model.layers.1.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
54
  "model.layers.1.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
55
  "model.layers.1.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
@@ -69,7 +71,6 @@
69
  "model.layers.1.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
70
  "model.layers.1.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
71
  "model.layers.1.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
72
- "model.layers.1.mlp.switch_mlp.down_proj.biases": "model-00001-of-00009.safetensors",
73
  "model.layers.1.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
74
  "model.layers.1.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
75
  "model.layers.1.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00009.safetensors",
@@ -84,6 +85,7 @@
84
  "model.layers.10.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
85
  "model.layers.10.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
86
  "model.layers.10.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
 
87
  "model.layers.10.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
88
  "model.layers.10.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
89
  "model.layers.10.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
@@ -103,7 +105,6 @@
103
  "model.layers.10.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
104
  "model.layers.10.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
105
  "model.layers.10.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
106
- "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00002-of-00009.safetensors",
107
  "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
108
  "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
109
  "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
@@ -127,19 +128,23 @@
127
  "model.layers.11.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
128
  "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
129
  "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
130
- "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
131
  "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00009.safetensors",
132
  "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00003-of-00009.safetensors",
133
  "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
134
  "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
135
  "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00009.safetensors",
 
136
  "model.layers.11.self_attn.k_proj.scales": "model-00002-of-00009.safetensors",
137
  "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
 
138
  "model.layers.11.self_attn.o_proj.scales": "model-00002-of-00009.safetensors",
139
  "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00009.safetensors",
140
  "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00009.safetensors",
 
141
  "model.layers.11.self_attn.q_proj.scales": "model-00002-of-00009.safetensors",
142
  "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00009.safetensors",
 
143
  "model.layers.11.self_attn.v_proj.scales": "model-00002-of-00009.safetensors",
144
  "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00009.safetensors",
145
  "model.layers.12.input_layernorm.weight": "model-00003-of-00009.safetensors",
@@ -149,6 +154,7 @@
149
  "model.layers.12.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
150
  "model.layers.12.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
151
  "model.layers.12.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
 
152
  "model.layers.12.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
153
  "model.layers.12.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
154
  "model.layers.12.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
@@ -182,6 +188,7 @@
182
  "model.layers.13.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
183
  "model.layers.13.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
184
  "model.layers.13.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
 
185
  "model.layers.13.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
186
  "model.layers.13.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
187
  "model.layers.13.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
@@ -215,6 +222,7 @@
215
  "model.layers.14.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
216
  "model.layers.14.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
217
  "model.layers.14.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
 
218
  "model.layers.14.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
219
  "model.layers.14.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
220
  "model.layers.14.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
@@ -234,7 +242,6 @@
234
  "model.layers.14.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
235
  "model.layers.14.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
236
  "model.layers.14.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
237
- "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00003-of-00009.safetensors",
238
  "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
239
  "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
240
  "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00009.safetensors",
@@ -264,13 +271,17 @@
264
  "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
265
  "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
266
  "model.layers.15.self_attn.k_norm.weight": "model-00003-of-00009.safetensors",
 
267
  "model.layers.15.self_attn.k_proj.scales": "model-00003-of-00009.safetensors",
268
  "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00009.safetensors",
 
269
  "model.layers.15.self_attn.o_proj.scales": "model-00003-of-00009.safetensors",
270
  "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00009.safetensors",
271
  "model.layers.15.self_attn.q_norm.weight": "model-00003-of-00009.safetensors",
 
272
  "model.layers.15.self_attn.q_proj.scales": "model-00003-of-00009.safetensors",
273
  "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00009.safetensors",
 
274
  "model.layers.15.self_attn.v_proj.scales": "model-00003-of-00009.safetensors",
275
  "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00009.safetensors",
276
  "model.layers.16.input_layernorm.weight": "model-00003-of-00009.safetensors",
@@ -280,6 +291,7 @@
280
  "model.layers.16.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
281
  "model.layers.16.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
282
  "model.layers.16.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
 
283
  "model.layers.16.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
284
  "model.layers.16.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
285
  "model.layers.16.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
@@ -313,6 +325,7 @@
313
  "model.layers.17.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
314
  "model.layers.17.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
315
  "model.layers.17.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
 
316
  "model.layers.17.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
317
  "model.layers.17.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
318
  "model.layers.17.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
@@ -346,6 +359,7 @@
346
  "model.layers.18.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
347
  "model.layers.18.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
348
  "model.layers.18.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
 
349
  "model.layers.18.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
350
  "model.layers.18.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
351
  "model.layers.18.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
@@ -394,13 +408,17 @@
394
  "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
395
  "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
396
  "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00009.safetensors",
 
397
  "model.layers.19.self_attn.k_proj.scales": "model-00004-of-00009.safetensors",
398
  "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",
 
399
  "model.layers.19.self_attn.o_proj.scales": "model-00004-of-00009.safetensors",
400
  "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00009.safetensors",
401
  "model.layers.19.self_attn.q_norm.weight": "model-00004-of-00009.safetensors",
 
402
  "model.layers.19.self_attn.q_proj.scales": "model-00004-of-00009.safetensors",
403
  "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00009.safetensors",
 
404
  "model.layers.19.self_attn.v_proj.scales": "model-00004-of-00009.safetensors",
405
  "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00009.safetensors",
406
  "model.layers.2.input_layernorm.weight": "model-00001-of-00009.safetensors",
@@ -410,6 +428,7 @@
410
  "model.layers.2.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
411
  "model.layers.2.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
412
  "model.layers.2.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
 
413
  "model.layers.2.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
414
  "model.layers.2.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
415
  "model.layers.2.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
@@ -443,6 +462,7 @@
443
  "model.layers.20.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
444
  "model.layers.20.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
445
  "model.layers.20.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
 
446
  "model.layers.20.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
447
  "model.layers.20.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
448
  "model.layers.20.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
@@ -476,6 +496,7 @@
476
  "model.layers.21.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
477
  "model.layers.21.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
478
  "model.layers.21.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
 
479
  "model.layers.21.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
480
  "model.layers.21.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
481
  "model.layers.21.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
@@ -509,6 +530,7 @@
509
  "model.layers.22.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
510
  "model.layers.22.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
511
  "model.layers.22.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
 
512
  "model.layers.22.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
513
  "model.layers.22.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
514
  "model.layers.22.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
@@ -516,27 +538,26 @@
516
  "model.layers.22.linear_attn.out_proj.scales": "model-00004-of-00009.safetensors",
517
  "model.layers.22.linear_attn.out_proj.weight": "model-00004-of-00009.safetensors",
518
  "model.layers.22.mlp.gate.weight": "model-00004-of-00009.safetensors",
519
- "model.layers.22.mlp.shared_expert.down_proj.biases": "model-00005-of-00009.safetensors",
520
- "model.layers.22.mlp.shared_expert.down_proj.scales": "model-00005-of-00009.safetensors",
521
- "model.layers.22.mlp.shared_expert.down_proj.weight": "model-00005-of-00009.safetensors",
522
- "model.layers.22.mlp.shared_expert.gate_proj.biases": "model-00005-of-00009.safetensors",
523
- "model.layers.22.mlp.shared_expert.gate_proj.scales": "model-00005-of-00009.safetensors",
524
- "model.layers.22.mlp.shared_expert.gate_proj.weight": "model-00005-of-00009.safetensors",
525
- "model.layers.22.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
526
- "model.layers.22.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
527
- "model.layers.22.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
528
- "model.layers.22.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
529
- "model.layers.22.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
530
- "model.layers.22.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
531
- "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00005-of-00009.safetensors",
532
- "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
533
  "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
534
  "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00009.safetensors",
535
  "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00009.safetensors",
536
  "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00004-of-00009.safetensors",
537
  "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
538
  "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
539
- "model.layers.23.input_layernorm.weight": "model-00005-of-00009.safetensors",
540
  "model.layers.23.mlp.gate.weight": "model-00005-of-00009.safetensors",
541
  "model.layers.23.mlp.shared_expert.down_proj.biases": "model-00005-of-00009.safetensors",
542
  "model.layers.23.mlp.shared_expert.down_proj.scales": "model-00005-of-00009.safetensors",
@@ -556,17 +577,21 @@
556
  "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00009.safetensors",
557
  "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
558
  "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
559
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
560
- "model.layers.23.self_attn.k_norm.weight": "model-00005-of-00009.safetensors",
561
- "model.layers.23.self_attn.k_proj.scales": "model-00005-of-00009.safetensors",
562
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
563
- "model.layers.23.self_attn.o_proj.scales": "model-00005-of-00009.safetensors",
564
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00009.safetensors",
565
- "model.layers.23.self_attn.q_norm.weight": "model-00005-of-00009.safetensors",
566
- "model.layers.23.self_attn.q_proj.scales": "model-00005-of-00009.safetensors",
567
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00009.safetensors",
568
- "model.layers.23.self_attn.v_proj.scales": "model-00005-of-00009.safetensors",
569
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00009.safetensors",
 
 
 
 
570
  "model.layers.24.input_layernorm.weight": "model-00005-of-00009.safetensors",
571
  "model.layers.24.linear_attn.A_log": "model-00005-of-00009.safetensors",
572
  "model.layers.24.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
@@ -574,6 +599,7 @@
574
  "model.layers.24.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
575
  "model.layers.24.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
576
  "model.layers.24.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
 
577
  "model.layers.24.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
578
  "model.layers.24.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
579
  "model.layers.24.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
@@ -607,6 +633,7 @@
607
  "model.layers.25.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
608
  "model.layers.25.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
609
  "model.layers.25.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
 
610
  "model.layers.25.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
611
  "model.layers.25.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
612
  "model.layers.25.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
@@ -640,6 +667,7 @@
640
  "model.layers.26.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
641
  "model.layers.26.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
642
  "model.layers.26.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
 
643
  "model.layers.26.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
644
  "model.layers.26.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
645
  "model.layers.26.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
@@ -688,13 +716,17 @@
688
  "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
689
  "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
690
  "model.layers.27.self_attn.k_norm.weight": "model-00005-of-00009.safetensors",
 
691
  "model.layers.27.self_attn.k_proj.scales": "model-00005-of-00009.safetensors",
692
  "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
 
693
  "model.layers.27.self_attn.o_proj.scales": "model-00005-of-00009.safetensors",
694
  "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00009.safetensors",
695
  "model.layers.27.self_attn.q_norm.weight": "model-00005-of-00009.safetensors",
 
696
  "model.layers.27.self_attn.q_proj.scales": "model-00005-of-00009.safetensors",
697
  "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00009.safetensors",
 
698
  "model.layers.27.self_attn.v_proj.scales": "model-00005-of-00009.safetensors",
699
  "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00009.safetensors",
700
  "model.layers.28.input_layernorm.weight": "model-00005-of-00009.safetensors",
@@ -704,6 +736,7 @@
704
  "model.layers.28.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
705
  "model.layers.28.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
706
  "model.layers.28.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
 
707
  "model.layers.28.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
708
  "model.layers.28.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
709
  "model.layers.28.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
@@ -730,20 +763,21 @@
730
  "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
731
  "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
732
  "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
733
- "model.layers.29.input_layernorm.weight": "model-00005-of-00009.safetensors",
734
- "model.layers.29.linear_attn.A_log": "model-00005-of-00009.safetensors",
735
  "model.layers.29.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
736
- "model.layers.29.linear_attn.dt_bias": "model-00005-of-00009.safetensors",
737
- "model.layers.29.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
738
- "model.layers.29.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
739
- "model.layers.29.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
 
740
  "model.layers.29.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
741
  "model.layers.29.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
742
- "model.layers.29.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
743
- "model.layers.29.linear_attn.out_proj.biases": "model-00005-of-00009.safetensors",
744
- "model.layers.29.linear_attn.out_proj.scales": "model-00005-of-00009.safetensors",
745
- "model.layers.29.linear_attn.out_proj.weight": "model-00005-of-00009.safetensors",
746
- "model.layers.29.mlp.gate.weight": "model-00005-of-00009.safetensors",
747
  "model.layers.29.mlp.shared_expert.down_proj.biases": "model-00006-of-00009.safetensors",
748
  "model.layers.29.mlp.shared_expert.down_proj.scales": "model-00006-of-00009.safetensors",
749
  "model.layers.29.mlp.shared_expert.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -762,7 +796,7 @@
762
  "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
763
  "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
764
  "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
765
- "model.layers.29.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
766
  "model.layers.3.input_layernorm.weight": "model-00001-of-00009.safetensors",
767
  "model.layers.3.mlp.gate.weight": "model-00001-of-00009.safetensors",
768
  "model.layers.3.mlp.shared_expert.down_proj.biases": "model-00001-of-00009.safetensors",
@@ -785,13 +819,17 @@
785
  "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00009.safetensors",
786
  "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
787
  "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00009.safetensors",
 
788
  "model.layers.3.self_attn.k_proj.scales": "model-00001-of-00009.safetensors",
789
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00009.safetensors",
 
790
  "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00009.safetensors",
791
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00009.safetensors",
792
  "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00009.safetensors",
 
793
  "model.layers.3.self_attn.q_proj.scales": "model-00001-of-00009.safetensors",
794
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00009.safetensors",
 
795
  "model.layers.3.self_attn.v_proj.scales": "model-00001-of-00009.safetensors",
796
  "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00009.safetensors",
797
  "model.layers.30.input_layernorm.weight": "model-00006-of-00009.safetensors",
@@ -801,6 +839,7 @@
801
  "model.layers.30.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
802
  "model.layers.30.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
803
  "model.layers.30.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
 
804
  "model.layers.30.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
805
  "model.layers.30.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
806
  "model.layers.30.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
@@ -841,7 +880,6 @@
841
  "model.layers.31.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
842
  "model.layers.31.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
843
  "model.layers.31.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
844
- "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00006-of-00009.safetensors",
845
  "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
846
  "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
847
  "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00009.safetensors",
@@ -850,13 +888,17 @@
850
  "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
851
  "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
852
  "model.layers.31.self_attn.k_norm.weight": "model-00006-of-00009.safetensors",
 
853
  "model.layers.31.self_attn.k_proj.scales": "model-00006-of-00009.safetensors",
854
  "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
 
855
  "model.layers.31.self_attn.o_proj.scales": "model-00006-of-00009.safetensors",
856
  "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00009.safetensors",
857
  "model.layers.31.self_attn.q_norm.weight": "model-00006-of-00009.safetensors",
 
858
  "model.layers.31.self_attn.q_proj.scales": "model-00006-of-00009.safetensors",
859
  "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00009.safetensors",
 
860
  "model.layers.31.self_attn.v_proj.scales": "model-00006-of-00009.safetensors",
861
  "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00009.safetensors",
862
  "model.layers.32.input_layernorm.weight": "model-00006-of-00009.safetensors",
@@ -866,6 +908,7 @@
866
  "model.layers.32.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
867
  "model.layers.32.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
868
  "model.layers.32.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
 
869
  "model.layers.32.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
870
  "model.layers.32.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
871
  "model.layers.32.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
@@ -899,6 +942,7 @@
899
  "model.layers.33.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
900
  "model.layers.33.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
901
  "model.layers.33.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
 
902
  "model.layers.33.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
903
  "model.layers.33.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
904
  "model.layers.33.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
@@ -932,6 +976,7 @@
932
  "model.layers.34.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
933
  "model.layers.34.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
934
  "model.layers.34.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
 
935
  "model.layers.34.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
936
  "model.layers.34.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
937
  "model.layers.34.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
@@ -939,20 +984,20 @@
939
  "model.layers.34.linear_attn.out_proj.scales": "model-00006-of-00009.safetensors",
940
  "model.layers.34.linear_attn.out_proj.weight": "model-00006-of-00009.safetensors",
941
  "model.layers.34.mlp.gate.weight": "model-00006-of-00009.safetensors",
942
- "model.layers.34.mlp.shared_expert.down_proj.biases": "model-00007-of-00009.safetensors",
943
- "model.layers.34.mlp.shared_expert.down_proj.scales": "model-00007-of-00009.safetensors",
944
- "model.layers.34.mlp.shared_expert.down_proj.weight": "model-00007-of-00009.safetensors",
945
- "model.layers.34.mlp.shared_expert.gate_proj.biases": "model-00007-of-00009.safetensors",
946
- "model.layers.34.mlp.shared_expert.gate_proj.scales": "model-00007-of-00009.safetensors",
947
- "model.layers.34.mlp.shared_expert.gate_proj.weight": "model-00007-of-00009.safetensors",
948
- "model.layers.34.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
949
- "model.layers.34.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
950
- "model.layers.34.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
951
- "model.layers.34.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
952
- "model.layers.34.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
953
- "model.layers.34.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
954
- "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
955
- "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
956
  "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00009.safetensors",
957
  "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
958
  "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
@@ -980,13 +1025,17 @@
980
  "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
981
  "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
982
  "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00009.safetensors",
 
983
  "model.layers.35.self_attn.k_proj.scales": "model-00007-of-00009.safetensors",
984
  "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
 
985
  "model.layers.35.self_attn.o_proj.scales": "model-00007-of-00009.safetensors",
986
  "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00009.safetensors",
987
  "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00009.safetensors",
 
988
  "model.layers.35.self_attn.q_proj.scales": "model-00007-of-00009.safetensors",
989
  "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00009.safetensors",
 
990
  "model.layers.35.self_attn.v_proj.scales": "model-00007-of-00009.safetensors",
991
  "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00009.safetensors",
992
  "model.layers.36.input_layernorm.weight": "model-00007-of-00009.safetensors",
@@ -996,6 +1045,7 @@
996
  "model.layers.36.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
997
  "model.layers.36.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
998
  "model.layers.36.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
 
999
  "model.layers.36.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1000
  "model.layers.36.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1001
  "model.layers.36.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
@@ -1029,6 +1079,7 @@
1029
  "model.layers.37.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1030
  "model.layers.37.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1031
  "model.layers.37.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
 
1032
  "model.layers.37.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1033
  "model.layers.37.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1034
  "model.layers.37.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
@@ -1062,6 +1113,7 @@
1062
  "model.layers.38.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1063
  "model.layers.38.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1064
  "model.layers.38.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
 
1065
  "model.layers.38.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1066
  "model.layers.38.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1067
  "model.layers.38.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
@@ -1102,7 +1154,6 @@
1102
  "model.layers.39.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1103
  "model.layers.39.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1104
  "model.layers.39.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1105
- "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00007-of-00009.safetensors",
1106
  "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1107
  "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
1108
  "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
@@ -1111,13 +1162,17 @@
1111
  "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1112
  "model.layers.39.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1113
  "model.layers.39.self_attn.k_norm.weight": "model-00007-of-00009.safetensors",
 
1114
  "model.layers.39.self_attn.k_proj.scales": "model-00007-of-00009.safetensors",
1115
  "model.layers.39.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
 
1116
  "model.layers.39.self_attn.o_proj.scales": "model-00007-of-00009.safetensors",
1117
  "model.layers.39.self_attn.o_proj.weight": "model-00007-of-00009.safetensors",
1118
  "model.layers.39.self_attn.q_norm.weight": "model-00007-of-00009.safetensors",
 
1119
  "model.layers.39.self_attn.q_proj.scales": "model-00007-of-00009.safetensors",
1120
  "model.layers.39.self_attn.q_proj.weight": "model-00007-of-00009.safetensors",
 
1121
  "model.layers.39.self_attn.v_proj.scales": "model-00007-of-00009.safetensors",
1122
  "model.layers.39.self_attn.v_proj.weight": "model-00007-of-00009.safetensors",
1123
  "model.layers.4.input_layernorm.weight": "model-00001-of-00009.safetensors",
@@ -1127,6 +1182,7 @@
1127
  "model.layers.4.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
1128
  "model.layers.4.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
1129
  "model.layers.4.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
 
1130
  "model.layers.4.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
1131
  "model.layers.4.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
1132
  "model.layers.4.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
@@ -1160,6 +1216,7 @@
1160
  "model.layers.40.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1161
  "model.layers.40.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1162
  "model.layers.40.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
 
1163
  "model.layers.40.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1164
  "model.layers.40.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1165
  "model.layers.40.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
@@ -1180,11 +1237,11 @@
1180
  "model.layers.40.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1181
  "model.layers.40.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1182
  "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1183
- "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
1184
  "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
1185
  "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
1186
- "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
1187
- "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00008-of-00009.safetensors",
1188
  "model.layers.40.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1189
  "model.layers.41.input_layernorm.weight": "model-00008-of-00009.safetensors",
1190
  "model.layers.41.linear_attn.A_log": "model-00008-of-00009.safetensors",
@@ -1193,6 +1250,7 @@
1193
  "model.layers.41.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1194
  "model.layers.41.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1195
  "model.layers.41.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
 
1196
  "model.layers.41.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1197
  "model.layers.41.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1198
  "model.layers.41.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
@@ -1226,6 +1284,7 @@
1226
  "model.layers.42.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1227
  "model.layers.42.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1228
  "model.layers.42.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
 
1229
  "model.layers.42.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1230
  "model.layers.42.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1231
  "model.layers.42.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
@@ -1245,7 +1304,6 @@
1245
  "model.layers.42.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1246
  "model.layers.42.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1247
  "model.layers.42.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1248
- "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00008-of-00009.safetensors",
1249
  "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1250
  "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
1251
  "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
@@ -1267,7 +1325,6 @@
1267
  "model.layers.43.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1268
  "model.layers.43.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1269
  "model.layers.43.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1270
- "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00008-of-00009.safetensors",
1271
  "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1272
  "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
1273
  "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
@@ -1296,6 +1353,7 @@
1296
  "model.layers.44.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1297
  "model.layers.44.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1298
  "model.layers.44.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
 
1299
  "model.layers.44.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1300
  "model.layers.44.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1301
  "model.layers.44.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
@@ -1337,41 +1395,40 @@
1337
  "model.layers.45.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1338
  "model.layers.45.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
1339
  "model.layers.45.mlp.gate.weight": "model-00008-of-00009.safetensors",
1340
- "model.layers.45.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
1341
- "model.layers.45.mlp.shared_expert.down_proj.scales": "model-00009-of-00009.safetensors",
1342
- "model.layers.45.mlp.shared_expert.down_proj.weight": "model-00009-of-00009.safetensors",
1343
- "model.layers.45.mlp.shared_expert.gate_proj.biases": "model-00009-of-00009.safetensors",
1344
- "model.layers.45.mlp.shared_expert.gate_proj.scales": "model-00009-of-00009.safetensors",
1345
- "model.layers.45.mlp.shared_expert.gate_proj.weight": "model-00009-of-00009.safetensors",
1346
- "model.layers.45.mlp.shared_expert.up_proj.biases": "model-00009-of-00009.safetensors",
1347
- "model.layers.45.mlp.shared_expert.up_proj.scales": "model-00009-of-00009.safetensors",
1348
- "model.layers.45.mlp.shared_expert.up_proj.weight": "model-00009-of-00009.safetensors",
1349
- "model.layers.45.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1350
- "model.layers.45.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1351
- "model.layers.45.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1352
- "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00009-of-00009.safetensors",
1353
- "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1354
- "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1355
  "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
1356
  "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00009.safetensors",
1357
  "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
1358
  "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00008-of-00009.safetensors",
1359
  "model.layers.45.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
1360
- "model.layers.46.input_layernorm.weight": "model-00009-of-00009.safetensors",
1361
- "model.layers.46.linear_attn.A_log": "model-00009-of-00009.safetensors",
1362
- "model.layers.46.linear_attn.conv1d.weight": "model-00009-of-00009.safetensors",
1363
- "model.layers.46.linear_attn.dt_bias": "model-00009-of-00009.safetensors",
1364
- "model.layers.46.linear_attn.in_proj_ba.biases": "model-00009-of-00009.safetensors",
1365
- "model.layers.46.linear_attn.in_proj_ba.scales": "model-00009-of-00009.safetensors",
1366
- "model.layers.46.linear_attn.in_proj_ba.weight": "model-00009-of-00009.safetensors",
1367
- "model.layers.46.linear_attn.in_proj_qkvz.biases": "model-00009-of-00009.safetensors",
1368
- "model.layers.46.linear_attn.in_proj_qkvz.scales": "model-00009-of-00009.safetensors",
1369
- "model.layers.46.linear_attn.in_proj_qkvz.weight": "model-00009-of-00009.safetensors",
1370
- "model.layers.46.linear_attn.norm.weight": "model-00009-of-00009.safetensors",
1371
- "model.layers.46.linear_attn.out_proj.biases": "model-00009-of-00009.safetensors",
1372
- "model.layers.46.linear_attn.out_proj.scales": "model-00009-of-00009.safetensors",
1373
- "model.layers.46.linear_attn.out_proj.weight": "model-00009-of-00009.safetensors",
1374
- "model.layers.46.mlp.gate.weight": "model-00009-of-00009.safetensors",
1375
  "model.layers.46.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
1376
  "model.layers.46.mlp.shared_expert.down_proj.scales": "model-00009-of-00009.safetensors",
1377
  "model.layers.46.mlp.shared_expert.down_proj.weight": "model-00009-of-00009.safetensors",
@@ -1384,14 +1441,13 @@
1384
  "model.layers.46.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1385
  "model.layers.46.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1386
  "model.layers.46.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1387
- "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00009-of-00009.safetensors",
1388
  "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1389
  "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1390
- "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00009.safetensors",
1391
- "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00009.safetensors",
1392
- "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00009-of-00009.safetensors",
1393
- "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00009-of-00009.safetensors",
1394
- "model.layers.46.post_attention_layernorm.weight": "model-00009-of-00009.safetensors",
1395
  "model.layers.47.input_layernorm.weight": "model-00009-of-00009.safetensors",
1396
  "model.layers.47.mlp.gate.weight": "model-00009-of-00009.safetensors",
1397
  "model.layers.47.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
@@ -1406,7 +1462,6 @@
1406
  "model.layers.47.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1407
  "model.layers.47.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1408
  "model.layers.47.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1409
- "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00009-of-00009.safetensors",
1410
  "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1411
  "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1412
  "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00009.safetensors",
@@ -1435,6 +1490,7 @@
1435
  "model.layers.5.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
1436
  "model.layers.5.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
1437
  "model.layers.5.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
 
1438
  "model.layers.5.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
1439
  "model.layers.5.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
1440
  "model.layers.5.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
@@ -1458,8 +1514,8 @@
1458
  "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
1459
  "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00009.safetensors",
1460
  "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00009.safetensors",
1461
- "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00001-of-00009.safetensors",
1462
- "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00001-of-00009.safetensors",
1463
  "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
1464
  "model.layers.6.input_layernorm.weight": "model-00002-of-00009.safetensors",
1465
  "model.layers.6.linear_attn.A_log": "model-00002-of-00009.safetensors",
@@ -1468,6 +1524,7 @@
1468
  "model.layers.6.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1469
  "model.layers.6.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1470
  "model.layers.6.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
 
1471
  "model.layers.6.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1472
  "model.layers.6.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1473
  "model.layers.6.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
@@ -1487,7 +1544,6 @@
1487
  "model.layers.6.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1488
  "model.layers.6.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1489
  "model.layers.6.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1490
- "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00002-of-00009.safetensors",
1491
  "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1492
  "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
1493
  "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
@@ -1517,13 +1573,17 @@
1517
  "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00002-of-00009.safetensors",
1518
  "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
1519
  "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00009.safetensors",
 
1520
  "model.layers.7.self_attn.k_proj.scales": "model-00002-of-00009.safetensors",
1521
  "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
 
1522
  "model.layers.7.self_attn.o_proj.scales": "model-00002-of-00009.safetensors",
1523
  "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00009.safetensors",
1524
  "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00009.safetensors",
 
1525
  "model.layers.7.self_attn.q_proj.scales": "model-00002-of-00009.safetensors",
1526
  "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00009.safetensors",
 
1527
  "model.layers.7.self_attn.v_proj.scales": "model-00002-of-00009.safetensors",
1528
  "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00009.safetensors",
1529
  "model.layers.8.input_layernorm.weight": "model-00002-of-00009.safetensors",
@@ -1533,6 +1593,7 @@
1533
  "model.layers.8.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1534
  "model.layers.8.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1535
  "model.layers.8.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
 
1536
  "model.layers.8.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1537
  "model.layers.8.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1538
  "model.layers.8.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
@@ -1566,6 +1627,7 @@
1566
  "model.layers.9.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1567
  "model.layers.9.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1568
  "model.layers.9.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
 
1569
  "model.layers.9.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1570
  "model.layers.9.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1571
  "model.layers.9.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 43659382272,
4
  "total_parameters": 79674388992
5
  },
6
  "weight_map": {
 
17
  "model.layers.0.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
18
  "model.layers.0.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
19
  "model.layers.0.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
20
+ "model.layers.0.linear_attn.in_proj_qkvz.biases": "model-00001-of-00009.safetensors",
21
  "model.layers.0.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
22
  "model.layers.0.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
23
  "model.layers.0.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
 
51
  "model.layers.1.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
52
  "model.layers.1.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
53
  "model.layers.1.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
54
+ "model.layers.1.linear_attn.in_proj_qkvz.biases": "model-00001-of-00009.safetensors",
55
  "model.layers.1.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
56
  "model.layers.1.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
57
  "model.layers.1.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
 
71
  "model.layers.1.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
72
  "model.layers.1.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
73
  "model.layers.1.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
 
74
  "model.layers.1.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
75
  "model.layers.1.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
76
  "model.layers.1.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00009.safetensors",
 
85
  "model.layers.10.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
86
  "model.layers.10.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
87
  "model.layers.10.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
88
+ "model.layers.10.linear_attn.in_proj_qkvz.biases": "model-00002-of-00009.safetensors",
89
  "model.layers.10.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
90
  "model.layers.10.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
91
  "model.layers.10.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
 
105
  "model.layers.10.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
106
  "model.layers.10.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
107
  "model.layers.10.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
 
108
  "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
109
  "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
110
  "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
 
128
  "model.layers.11.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
129
  "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
130
  "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
131
+ "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00009.safetensors",
132
  "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00009.safetensors",
133
  "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00003-of-00009.safetensors",
134
  "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
135
  "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
136
  "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00009.safetensors",
137
+ "model.layers.11.self_attn.k_proj.biases": "model-00002-of-00009.safetensors",
138
  "model.layers.11.self_attn.k_proj.scales": "model-00002-of-00009.safetensors",
139
  "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
140
+ "model.layers.11.self_attn.o_proj.biases": "model-00002-of-00009.safetensors",
141
  "model.layers.11.self_attn.o_proj.scales": "model-00002-of-00009.safetensors",
142
  "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00009.safetensors",
143
  "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00009.safetensors",
144
+ "model.layers.11.self_attn.q_proj.biases": "model-00002-of-00009.safetensors",
145
  "model.layers.11.self_attn.q_proj.scales": "model-00002-of-00009.safetensors",
146
  "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00009.safetensors",
147
+ "model.layers.11.self_attn.v_proj.biases": "model-00002-of-00009.safetensors",
148
  "model.layers.11.self_attn.v_proj.scales": "model-00002-of-00009.safetensors",
149
  "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00009.safetensors",
150
  "model.layers.12.input_layernorm.weight": "model-00003-of-00009.safetensors",
 
154
  "model.layers.12.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
155
  "model.layers.12.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
156
  "model.layers.12.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
157
+ "model.layers.12.linear_attn.in_proj_qkvz.biases": "model-00003-of-00009.safetensors",
158
  "model.layers.12.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
159
  "model.layers.12.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
160
  "model.layers.12.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
 
188
  "model.layers.13.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
189
  "model.layers.13.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
190
  "model.layers.13.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
191
+ "model.layers.13.linear_attn.in_proj_qkvz.biases": "model-00003-of-00009.safetensors",
192
  "model.layers.13.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
193
  "model.layers.13.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
194
  "model.layers.13.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
 
222
  "model.layers.14.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
223
  "model.layers.14.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
224
  "model.layers.14.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
225
+ "model.layers.14.linear_attn.in_proj_qkvz.biases": "model-00003-of-00009.safetensors",
226
  "model.layers.14.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
227
  "model.layers.14.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
228
  "model.layers.14.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
 
242
  "model.layers.14.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
243
  "model.layers.14.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
244
  "model.layers.14.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
 
245
  "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
246
  "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
247
  "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00009.safetensors",
 
271
  "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
272
  "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
273
  "model.layers.15.self_attn.k_norm.weight": "model-00003-of-00009.safetensors",
274
+ "model.layers.15.self_attn.k_proj.biases": "model-00003-of-00009.safetensors",
275
  "model.layers.15.self_attn.k_proj.scales": "model-00003-of-00009.safetensors",
276
  "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00009.safetensors",
277
+ "model.layers.15.self_attn.o_proj.biases": "model-00003-of-00009.safetensors",
278
  "model.layers.15.self_attn.o_proj.scales": "model-00003-of-00009.safetensors",
279
  "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00009.safetensors",
280
  "model.layers.15.self_attn.q_norm.weight": "model-00003-of-00009.safetensors",
281
+ "model.layers.15.self_attn.q_proj.biases": "model-00003-of-00009.safetensors",
282
  "model.layers.15.self_attn.q_proj.scales": "model-00003-of-00009.safetensors",
283
  "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00009.safetensors",
284
+ "model.layers.15.self_attn.v_proj.biases": "model-00003-of-00009.safetensors",
285
  "model.layers.15.self_attn.v_proj.scales": "model-00003-of-00009.safetensors",
286
  "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00009.safetensors",
287
  "model.layers.16.input_layernorm.weight": "model-00003-of-00009.safetensors",
 
291
  "model.layers.16.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
292
  "model.layers.16.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
293
  "model.layers.16.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
294
+ "model.layers.16.linear_attn.in_proj_qkvz.biases": "model-00003-of-00009.safetensors",
295
  "model.layers.16.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
296
  "model.layers.16.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
297
  "model.layers.16.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
 
325
  "model.layers.17.linear_attn.in_proj_ba.biases": "model-00003-of-00009.safetensors",
326
  "model.layers.17.linear_attn.in_proj_ba.scales": "model-00003-of-00009.safetensors",
327
  "model.layers.17.linear_attn.in_proj_ba.weight": "model-00003-of-00009.safetensors",
328
+ "model.layers.17.linear_attn.in_proj_qkvz.biases": "model-00003-of-00009.safetensors",
329
  "model.layers.17.linear_attn.in_proj_qkvz.scales": "model-00003-of-00009.safetensors",
330
  "model.layers.17.linear_attn.in_proj_qkvz.weight": "model-00003-of-00009.safetensors",
331
  "model.layers.17.linear_attn.norm.weight": "model-00003-of-00009.safetensors",
 
359
  "model.layers.18.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
360
  "model.layers.18.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
361
  "model.layers.18.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
362
+ "model.layers.18.linear_attn.in_proj_qkvz.biases": "model-00004-of-00009.safetensors",
363
  "model.layers.18.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
364
  "model.layers.18.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
365
  "model.layers.18.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
 
408
  "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
409
  "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
410
  "model.layers.19.self_attn.k_norm.weight": "model-00004-of-00009.safetensors",
411
+ "model.layers.19.self_attn.k_proj.biases": "model-00004-of-00009.safetensors",
412
  "model.layers.19.self_attn.k_proj.scales": "model-00004-of-00009.safetensors",
413
  "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",
414
+ "model.layers.19.self_attn.o_proj.biases": "model-00004-of-00009.safetensors",
415
  "model.layers.19.self_attn.o_proj.scales": "model-00004-of-00009.safetensors",
416
  "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00009.safetensors",
417
  "model.layers.19.self_attn.q_norm.weight": "model-00004-of-00009.safetensors",
418
+ "model.layers.19.self_attn.q_proj.biases": "model-00004-of-00009.safetensors",
419
  "model.layers.19.self_attn.q_proj.scales": "model-00004-of-00009.safetensors",
420
  "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00009.safetensors",
421
+ "model.layers.19.self_attn.v_proj.biases": "model-00004-of-00009.safetensors",
422
  "model.layers.19.self_attn.v_proj.scales": "model-00004-of-00009.safetensors",
423
  "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00009.safetensors",
424
  "model.layers.2.input_layernorm.weight": "model-00001-of-00009.safetensors",
 
428
  "model.layers.2.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
429
  "model.layers.2.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
430
  "model.layers.2.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
431
+ "model.layers.2.linear_attn.in_proj_qkvz.biases": "model-00001-of-00009.safetensors",
432
  "model.layers.2.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
433
  "model.layers.2.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
434
  "model.layers.2.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
 
462
  "model.layers.20.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
463
  "model.layers.20.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
464
  "model.layers.20.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
465
+ "model.layers.20.linear_attn.in_proj_qkvz.biases": "model-00004-of-00009.safetensors",
466
  "model.layers.20.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
467
  "model.layers.20.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
468
  "model.layers.20.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
 
496
  "model.layers.21.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
497
  "model.layers.21.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
498
  "model.layers.21.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
499
+ "model.layers.21.linear_attn.in_proj_qkvz.biases": "model-00004-of-00009.safetensors",
500
  "model.layers.21.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
501
  "model.layers.21.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
502
  "model.layers.21.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
 
530
  "model.layers.22.linear_attn.in_proj_ba.biases": "model-00004-of-00009.safetensors",
531
  "model.layers.22.linear_attn.in_proj_ba.scales": "model-00004-of-00009.safetensors",
532
  "model.layers.22.linear_attn.in_proj_ba.weight": "model-00004-of-00009.safetensors",
533
+ "model.layers.22.linear_attn.in_proj_qkvz.biases": "model-00004-of-00009.safetensors",
534
  "model.layers.22.linear_attn.in_proj_qkvz.scales": "model-00004-of-00009.safetensors",
535
  "model.layers.22.linear_attn.in_proj_qkvz.weight": "model-00004-of-00009.safetensors",
536
  "model.layers.22.linear_attn.norm.weight": "model-00004-of-00009.safetensors",
 
538
  "model.layers.22.linear_attn.out_proj.scales": "model-00004-of-00009.safetensors",
539
  "model.layers.22.linear_attn.out_proj.weight": "model-00004-of-00009.safetensors",
540
  "model.layers.22.mlp.gate.weight": "model-00004-of-00009.safetensors",
541
+ "model.layers.22.mlp.shared_expert.down_proj.biases": "model-00004-of-00009.safetensors",
542
+ "model.layers.22.mlp.shared_expert.down_proj.scales": "model-00004-of-00009.safetensors",
543
+ "model.layers.22.mlp.shared_expert.down_proj.weight": "model-00004-of-00009.safetensors",
544
+ "model.layers.22.mlp.shared_expert.gate_proj.biases": "model-00004-of-00009.safetensors",
545
+ "model.layers.22.mlp.shared_expert.gate_proj.scales": "model-00004-of-00009.safetensors",
546
+ "model.layers.22.mlp.shared_expert.gate_proj.weight": "model-00004-of-00009.safetensors",
547
+ "model.layers.22.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
548
+ "model.layers.22.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
549
+ "model.layers.22.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
550
+ "model.layers.22.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
551
+ "model.layers.22.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
552
+ "model.layers.22.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
553
+ "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
 
554
  "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
555
  "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00009.safetensors",
556
  "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00009.safetensors",
557
  "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00004-of-00009.safetensors",
558
  "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
559
  "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
560
+ "model.layers.23.input_layernorm.weight": "model-00004-of-00009.safetensors",
561
  "model.layers.23.mlp.gate.weight": "model-00005-of-00009.safetensors",
562
  "model.layers.23.mlp.shared_expert.down_proj.biases": "model-00005-of-00009.safetensors",
563
  "model.layers.23.mlp.shared_expert.down_proj.scales": "model-00005-of-00009.safetensors",
 
577
  "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00009.safetensors",
578
  "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
579
  "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
580
+ "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
581
+ "model.layers.23.self_attn.k_norm.weight": "model-00004-of-00009.safetensors",
582
+ "model.layers.23.self_attn.k_proj.biases": "model-00004-of-00009.safetensors",
583
+ "model.layers.23.self_attn.k_proj.scales": "model-00004-of-00009.safetensors",
584
+ "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",
585
+ "model.layers.23.self_attn.o_proj.biases": "model-00004-of-00009.safetensors",
586
+ "model.layers.23.self_attn.o_proj.scales": "model-00004-of-00009.safetensors",
587
+ "model.layers.23.self_attn.o_proj.weight": "model-00004-of-00009.safetensors",
588
+ "model.layers.23.self_attn.q_norm.weight": "model-00004-of-00009.safetensors",
589
+ "model.layers.23.self_attn.q_proj.biases": "model-00004-of-00009.safetensors",
590
+ "model.layers.23.self_attn.q_proj.scales": "model-00004-of-00009.safetensors",
591
+ "model.layers.23.self_attn.q_proj.weight": "model-00004-of-00009.safetensors",
592
+ "model.layers.23.self_attn.v_proj.biases": "model-00004-of-00009.safetensors",
593
+ "model.layers.23.self_attn.v_proj.scales": "model-00004-of-00009.safetensors",
594
+ "model.layers.23.self_attn.v_proj.weight": "model-00004-of-00009.safetensors",
595
  "model.layers.24.input_layernorm.weight": "model-00005-of-00009.safetensors",
596
  "model.layers.24.linear_attn.A_log": "model-00005-of-00009.safetensors",
597
  "model.layers.24.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
 
599
  "model.layers.24.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
600
  "model.layers.24.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
601
  "model.layers.24.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
602
+ "model.layers.24.linear_attn.in_proj_qkvz.biases": "model-00005-of-00009.safetensors",
603
  "model.layers.24.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
604
  "model.layers.24.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
605
  "model.layers.24.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
 
633
  "model.layers.25.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
634
  "model.layers.25.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
635
  "model.layers.25.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
636
+ "model.layers.25.linear_attn.in_proj_qkvz.biases": "model-00005-of-00009.safetensors",
637
  "model.layers.25.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
638
  "model.layers.25.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
639
  "model.layers.25.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
 
667
  "model.layers.26.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
668
  "model.layers.26.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
669
  "model.layers.26.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
670
+ "model.layers.26.linear_attn.in_proj_qkvz.biases": "model-00005-of-00009.safetensors",
671
  "model.layers.26.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
672
  "model.layers.26.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
673
  "model.layers.26.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
 
716
  "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
717
  "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
718
  "model.layers.27.self_attn.k_norm.weight": "model-00005-of-00009.safetensors",
719
+ "model.layers.27.self_attn.k_proj.biases": "model-00005-of-00009.safetensors",
720
  "model.layers.27.self_attn.k_proj.scales": "model-00005-of-00009.safetensors",
721
  "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
722
+ "model.layers.27.self_attn.o_proj.biases": "model-00005-of-00009.safetensors",
723
  "model.layers.27.self_attn.o_proj.scales": "model-00005-of-00009.safetensors",
724
  "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00009.safetensors",
725
  "model.layers.27.self_attn.q_norm.weight": "model-00005-of-00009.safetensors",
726
+ "model.layers.27.self_attn.q_proj.biases": "model-00005-of-00009.safetensors",
727
  "model.layers.27.self_attn.q_proj.scales": "model-00005-of-00009.safetensors",
728
  "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00009.safetensors",
729
+ "model.layers.27.self_attn.v_proj.biases": "model-00005-of-00009.safetensors",
730
  "model.layers.27.self_attn.v_proj.scales": "model-00005-of-00009.safetensors",
731
  "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00009.safetensors",
732
  "model.layers.28.input_layernorm.weight": "model-00005-of-00009.safetensors",
 
736
  "model.layers.28.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
737
  "model.layers.28.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
738
  "model.layers.28.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
739
+ "model.layers.28.linear_attn.in_proj_qkvz.biases": "model-00005-of-00009.safetensors",
740
  "model.layers.28.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
741
  "model.layers.28.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
742
  "model.layers.28.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
 
763
  "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
764
  "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
765
  "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
766
+ "model.layers.29.input_layernorm.weight": "model-00006-of-00009.safetensors",
767
+ "model.layers.29.linear_attn.A_log": "model-00006-of-00009.safetensors",
768
  "model.layers.29.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
769
+ "model.layers.29.linear_attn.dt_bias": "model-00006-of-00009.safetensors",
770
+ "model.layers.29.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
771
+ "model.layers.29.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
772
+ "model.layers.29.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
773
+ "model.layers.29.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
774
  "model.layers.29.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
775
  "model.layers.29.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
776
+ "model.layers.29.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
777
+ "model.layers.29.linear_attn.out_proj.biases": "model-00006-of-00009.safetensors",
778
+ "model.layers.29.linear_attn.out_proj.scales": "model-00006-of-00009.safetensors",
779
+ "model.layers.29.linear_attn.out_proj.weight": "model-00006-of-00009.safetensors",
780
+ "model.layers.29.mlp.gate.weight": "model-00006-of-00009.safetensors",
781
  "model.layers.29.mlp.shared_expert.down_proj.biases": "model-00006-of-00009.safetensors",
782
  "model.layers.29.mlp.shared_expert.down_proj.scales": "model-00006-of-00009.safetensors",
783
  "model.layers.29.mlp.shared_expert.down_proj.weight": "model-00006-of-00009.safetensors",
 
796
  "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
797
  "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
798
  "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
799
+ "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
800
  "model.layers.3.input_layernorm.weight": "model-00001-of-00009.safetensors",
801
  "model.layers.3.mlp.gate.weight": "model-00001-of-00009.safetensors",
802
  "model.layers.3.mlp.shared_expert.down_proj.biases": "model-00001-of-00009.safetensors",
 
819
  "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00009.safetensors",
820
  "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
821
  "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00009.safetensors",
822
+ "model.layers.3.self_attn.k_proj.biases": "model-00001-of-00009.safetensors",
823
  "model.layers.3.self_attn.k_proj.scales": "model-00001-of-00009.safetensors",
824
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00009.safetensors",
825
+ "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00009.safetensors",
826
  "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00009.safetensors",
827
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00009.safetensors",
828
  "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00009.safetensors",
829
+ "model.layers.3.self_attn.q_proj.biases": "model-00001-of-00009.safetensors",
830
  "model.layers.3.self_attn.q_proj.scales": "model-00001-of-00009.safetensors",
831
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00009.safetensors",
832
+ "model.layers.3.self_attn.v_proj.biases": "model-00001-of-00009.safetensors",
833
  "model.layers.3.self_attn.v_proj.scales": "model-00001-of-00009.safetensors",
834
  "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00009.safetensors",
835
  "model.layers.30.input_layernorm.weight": "model-00006-of-00009.safetensors",
 
839
  "model.layers.30.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
840
  "model.layers.30.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
841
  "model.layers.30.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
842
+ "model.layers.30.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
843
  "model.layers.30.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
844
  "model.layers.30.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
845
  "model.layers.30.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
 
880
  "model.layers.31.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
881
  "model.layers.31.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
882
  "model.layers.31.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
 
883
  "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
884
  "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
885
  "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00009.safetensors",
 
888
  "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
889
  "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
890
  "model.layers.31.self_attn.k_norm.weight": "model-00006-of-00009.safetensors",
891
+ "model.layers.31.self_attn.k_proj.biases": "model-00006-of-00009.safetensors",
892
  "model.layers.31.self_attn.k_proj.scales": "model-00006-of-00009.safetensors",
893
  "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
894
+ "model.layers.31.self_attn.o_proj.biases": "model-00006-of-00009.safetensors",
895
  "model.layers.31.self_attn.o_proj.scales": "model-00006-of-00009.safetensors",
896
  "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00009.safetensors",
897
  "model.layers.31.self_attn.q_norm.weight": "model-00006-of-00009.safetensors",
898
+ "model.layers.31.self_attn.q_proj.biases": "model-00006-of-00009.safetensors",
899
  "model.layers.31.self_attn.q_proj.scales": "model-00006-of-00009.safetensors",
900
  "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00009.safetensors",
901
+ "model.layers.31.self_attn.v_proj.biases": "model-00006-of-00009.safetensors",
902
  "model.layers.31.self_attn.v_proj.scales": "model-00006-of-00009.safetensors",
903
  "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00009.safetensors",
904
  "model.layers.32.input_layernorm.weight": "model-00006-of-00009.safetensors",
 
908
  "model.layers.32.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
909
  "model.layers.32.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
910
  "model.layers.32.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
911
+ "model.layers.32.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
912
  "model.layers.32.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
913
  "model.layers.32.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
914
  "model.layers.32.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
 
942
  "model.layers.33.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
943
  "model.layers.33.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
944
  "model.layers.33.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
945
+ "model.layers.33.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
946
  "model.layers.33.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
947
  "model.layers.33.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
948
  "model.layers.33.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
 
976
  "model.layers.34.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
977
  "model.layers.34.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
978
  "model.layers.34.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
979
+ "model.layers.34.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
980
  "model.layers.34.linear_attn.in_proj_qkvz.scales": "model-00006-of-00009.safetensors",
981
  "model.layers.34.linear_attn.in_proj_qkvz.weight": "model-00006-of-00009.safetensors",
982
  "model.layers.34.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
 
984
  "model.layers.34.linear_attn.out_proj.scales": "model-00006-of-00009.safetensors",
985
  "model.layers.34.linear_attn.out_proj.weight": "model-00006-of-00009.safetensors",
986
  "model.layers.34.mlp.gate.weight": "model-00006-of-00009.safetensors",
987
+ "model.layers.34.mlp.shared_expert.down_proj.biases": "model-00006-of-00009.safetensors",
988
+ "model.layers.34.mlp.shared_expert.down_proj.scales": "model-00006-of-00009.safetensors",
989
+ "model.layers.34.mlp.shared_expert.down_proj.weight": "model-00006-of-00009.safetensors",
990
+ "model.layers.34.mlp.shared_expert.gate_proj.biases": "model-00006-of-00009.safetensors",
991
+ "model.layers.34.mlp.shared_expert.gate_proj.scales": "model-00006-of-00009.safetensors",
992
+ "model.layers.34.mlp.shared_expert.gate_proj.weight": "model-00006-of-00009.safetensors",
993
+ "model.layers.34.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
994
+ "model.layers.34.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
995
+ "model.layers.34.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
996
+ "model.layers.34.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
997
+ "model.layers.34.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
998
+ "model.layers.34.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
999
+ "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
1000
+ "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
1001
  "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00009.safetensors",
1002
  "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
1003
  "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
 
1025
  "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1026
  "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1027
  "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00009.safetensors",
1028
+ "model.layers.35.self_attn.k_proj.biases": "model-00007-of-00009.safetensors",
1029
  "model.layers.35.self_attn.k_proj.scales": "model-00007-of-00009.safetensors",
1030
  "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
1031
+ "model.layers.35.self_attn.o_proj.biases": "model-00007-of-00009.safetensors",
1032
  "model.layers.35.self_attn.o_proj.scales": "model-00007-of-00009.safetensors",
1033
  "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00009.safetensors",
1034
  "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00009.safetensors",
1035
+ "model.layers.35.self_attn.q_proj.biases": "model-00007-of-00009.safetensors",
1036
  "model.layers.35.self_attn.q_proj.scales": "model-00007-of-00009.safetensors",
1037
  "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00009.safetensors",
1038
+ "model.layers.35.self_attn.v_proj.biases": "model-00007-of-00009.safetensors",
1039
  "model.layers.35.self_attn.v_proj.scales": "model-00007-of-00009.safetensors",
1040
  "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00009.safetensors",
1041
  "model.layers.36.input_layernorm.weight": "model-00007-of-00009.safetensors",
 
1045
  "model.layers.36.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1046
  "model.layers.36.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1047
  "model.layers.36.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
1048
+ "model.layers.36.linear_attn.in_proj_qkvz.biases": "model-00007-of-00009.safetensors",
1049
  "model.layers.36.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1050
  "model.layers.36.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1051
  "model.layers.36.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
 
1079
  "model.layers.37.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1080
  "model.layers.37.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1081
  "model.layers.37.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
1082
+ "model.layers.37.linear_attn.in_proj_qkvz.biases": "model-00007-of-00009.safetensors",
1083
  "model.layers.37.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1084
  "model.layers.37.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1085
  "model.layers.37.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
 
1113
  "model.layers.38.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1114
  "model.layers.38.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1115
  "model.layers.38.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
1116
+ "model.layers.38.linear_attn.in_proj_qkvz.biases": "model-00007-of-00009.safetensors",
1117
  "model.layers.38.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1118
  "model.layers.38.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1119
  "model.layers.38.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
 
1154
  "model.layers.39.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1155
  "model.layers.39.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1156
  "model.layers.39.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
 
1157
  "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1158
  "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
1159
  "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
 
1162
  "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1163
  "model.layers.39.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1164
  "model.layers.39.self_attn.k_norm.weight": "model-00007-of-00009.safetensors",
1165
+ "model.layers.39.self_attn.k_proj.biases": "model-00007-of-00009.safetensors",
1166
  "model.layers.39.self_attn.k_proj.scales": "model-00007-of-00009.safetensors",
1167
  "model.layers.39.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
1168
+ "model.layers.39.self_attn.o_proj.biases": "model-00007-of-00009.safetensors",
1169
  "model.layers.39.self_attn.o_proj.scales": "model-00007-of-00009.safetensors",
1170
  "model.layers.39.self_attn.o_proj.weight": "model-00007-of-00009.safetensors",
1171
  "model.layers.39.self_attn.q_norm.weight": "model-00007-of-00009.safetensors",
1172
+ "model.layers.39.self_attn.q_proj.biases": "model-00007-of-00009.safetensors",
1173
  "model.layers.39.self_attn.q_proj.scales": "model-00007-of-00009.safetensors",
1174
  "model.layers.39.self_attn.q_proj.weight": "model-00007-of-00009.safetensors",
1175
+ "model.layers.39.self_attn.v_proj.biases": "model-00007-of-00009.safetensors",
1176
  "model.layers.39.self_attn.v_proj.scales": "model-00007-of-00009.safetensors",
1177
  "model.layers.39.self_attn.v_proj.weight": "model-00007-of-00009.safetensors",
1178
  "model.layers.4.input_layernorm.weight": "model-00001-of-00009.safetensors",
 
1182
  "model.layers.4.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
1183
  "model.layers.4.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
1184
  "model.layers.4.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
1185
+ "model.layers.4.linear_attn.in_proj_qkvz.biases": "model-00001-of-00009.safetensors",
1186
  "model.layers.4.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
1187
  "model.layers.4.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
1188
  "model.layers.4.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
 
1216
  "model.layers.40.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1217
  "model.layers.40.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1218
  "model.layers.40.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
1219
+ "model.layers.40.linear_attn.in_proj_qkvz.biases": "model-00007-of-00009.safetensors",
1220
  "model.layers.40.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1221
  "model.layers.40.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1222
  "model.layers.40.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
 
1237
  "model.layers.40.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1238
  "model.layers.40.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1239
  "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1240
+ "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
1241
  "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
1242
  "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
1243
+ "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00007-of-00009.safetensors",
1244
+ "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1245
  "model.layers.40.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1246
  "model.layers.41.input_layernorm.weight": "model-00008-of-00009.safetensors",
1247
  "model.layers.41.linear_attn.A_log": "model-00008-of-00009.safetensors",
 
1250
  "model.layers.41.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1251
  "model.layers.41.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1252
  "model.layers.41.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
1253
+ "model.layers.41.linear_attn.in_proj_qkvz.biases": "model-00008-of-00009.safetensors",
1254
  "model.layers.41.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1255
  "model.layers.41.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1256
  "model.layers.41.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
 
1284
  "model.layers.42.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1285
  "model.layers.42.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1286
  "model.layers.42.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
1287
+ "model.layers.42.linear_attn.in_proj_qkvz.biases": "model-00008-of-00009.safetensors",
1288
  "model.layers.42.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1289
  "model.layers.42.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1290
  "model.layers.42.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
 
1304
  "model.layers.42.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1305
  "model.layers.42.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1306
  "model.layers.42.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
 
1307
  "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1308
  "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
1309
  "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
 
1325
  "model.layers.43.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1326
  "model.layers.43.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1327
  "model.layers.43.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
 
1328
  "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1329
  "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
1330
  "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
 
1353
  "model.layers.44.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1354
  "model.layers.44.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1355
  "model.layers.44.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
1356
+ "model.layers.44.linear_attn.in_proj_qkvz.biases": "model-00008-of-00009.safetensors",
1357
  "model.layers.44.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1358
  "model.layers.44.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1359
  "model.layers.44.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
 
1395
  "model.layers.45.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1396
  "model.layers.45.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
1397
  "model.layers.45.mlp.gate.weight": "model-00008-of-00009.safetensors",
1398
+ "model.layers.45.mlp.shared_expert.down_proj.biases": "model-00008-of-00009.safetensors",
1399
+ "model.layers.45.mlp.shared_expert.down_proj.scales": "model-00008-of-00009.safetensors",
1400
+ "model.layers.45.mlp.shared_expert.down_proj.weight": "model-00008-of-00009.safetensors",
1401
+ "model.layers.45.mlp.shared_expert.gate_proj.biases": "model-00008-of-00009.safetensors",
1402
+ "model.layers.45.mlp.shared_expert.gate_proj.scales": "model-00008-of-00009.safetensors",
1403
+ "model.layers.45.mlp.shared_expert.gate_proj.weight": "model-00008-of-00009.safetensors",
1404
+ "model.layers.45.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1405
+ "model.layers.45.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1406
+ "model.layers.45.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1407
+ "model.layers.45.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1408
+ "model.layers.45.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1409
+ "model.layers.45.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1410
+ "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1411
+ "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1412
  "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
1413
  "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00009.safetensors",
1414
  "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
1415
  "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00008-of-00009.safetensors",
1416
  "model.layers.45.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
1417
+ "model.layers.46.input_layernorm.weight": "model-00008-of-00009.safetensors",
1418
+ "model.layers.46.linear_attn.A_log": "model-00008-of-00009.safetensors",
1419
+ "model.layers.46.linear_attn.conv1d.weight": "model-00008-of-00009.safetensors",
1420
+ "model.layers.46.linear_attn.dt_bias": "model-00008-of-00009.safetensors",
1421
+ "model.layers.46.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1422
+ "model.layers.46.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1423
+ "model.layers.46.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
1424
+ "model.layers.46.linear_attn.in_proj_qkvz.biases": "model-00008-of-00009.safetensors",
1425
+ "model.layers.46.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1426
+ "model.layers.46.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1427
+ "model.layers.46.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
1428
+ "model.layers.46.linear_attn.out_proj.biases": "model-00008-of-00009.safetensors",
1429
+ "model.layers.46.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1430
+ "model.layers.46.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
1431
+ "model.layers.46.mlp.gate.weight": "model-00008-of-00009.safetensors",
1432
  "model.layers.46.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
1433
  "model.layers.46.mlp.shared_expert.down_proj.scales": "model-00009-of-00009.safetensors",
1434
  "model.layers.46.mlp.shared_expert.down_proj.weight": "model-00009-of-00009.safetensors",
 
1441
  "model.layers.46.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1442
  "model.layers.46.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1443
  "model.layers.46.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
 
1444
  "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1445
  "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1446
+ "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
1447
+ "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00009.safetensors",
1448
+ "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
1449
+ "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00008-of-00009.safetensors",
1450
+ "model.layers.46.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
1451
  "model.layers.47.input_layernorm.weight": "model-00009-of-00009.safetensors",
1452
  "model.layers.47.mlp.gate.weight": "model-00009-of-00009.safetensors",
1453
  "model.layers.47.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
 
1462
  "model.layers.47.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1463
  "model.layers.47.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1464
  "model.layers.47.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
 
1465
  "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1466
  "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1467
  "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00009.safetensors",
 
1490
  "model.layers.5.linear_attn.in_proj_ba.biases": "model-00001-of-00009.safetensors",
1491
  "model.layers.5.linear_attn.in_proj_ba.scales": "model-00001-of-00009.safetensors",
1492
  "model.layers.5.linear_attn.in_proj_ba.weight": "model-00001-of-00009.safetensors",
1493
+ "model.layers.5.linear_attn.in_proj_qkvz.biases": "model-00001-of-00009.safetensors",
1494
  "model.layers.5.linear_attn.in_proj_qkvz.scales": "model-00001-of-00009.safetensors",
1495
  "model.layers.5.linear_attn.in_proj_qkvz.weight": "model-00001-of-00009.safetensors",
1496
  "model.layers.5.linear_attn.norm.weight": "model-00001-of-00009.safetensors",
 
1514
  "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
1515
  "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00009.safetensors",
1516
  "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00009.safetensors",
1517
+ "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00002-of-00009.safetensors",
1518
+ "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00002-of-00009.safetensors",
1519
  "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
1520
  "model.layers.6.input_layernorm.weight": "model-00002-of-00009.safetensors",
1521
  "model.layers.6.linear_attn.A_log": "model-00002-of-00009.safetensors",
 
1524
  "model.layers.6.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1525
  "model.layers.6.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1526
  "model.layers.6.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
1527
+ "model.layers.6.linear_attn.in_proj_qkvz.biases": "model-00002-of-00009.safetensors",
1528
  "model.layers.6.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1529
  "model.layers.6.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1530
  "model.layers.6.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
 
1544
  "model.layers.6.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1545
  "model.layers.6.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1546
  "model.layers.6.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
 
1547
  "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1548
  "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
1549
  "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
 
1573
  "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00002-of-00009.safetensors",
1574
  "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
1575
  "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00009.safetensors",
1576
+ "model.layers.7.self_attn.k_proj.biases": "model-00002-of-00009.safetensors",
1577
  "model.layers.7.self_attn.k_proj.scales": "model-00002-of-00009.safetensors",
1578
  "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
1579
+ "model.layers.7.self_attn.o_proj.biases": "model-00002-of-00009.safetensors",
1580
  "model.layers.7.self_attn.o_proj.scales": "model-00002-of-00009.safetensors",
1581
  "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00009.safetensors",
1582
  "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00009.safetensors",
1583
+ "model.layers.7.self_attn.q_proj.biases": "model-00002-of-00009.safetensors",
1584
  "model.layers.7.self_attn.q_proj.scales": "model-00002-of-00009.safetensors",
1585
  "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00009.safetensors",
1586
+ "model.layers.7.self_attn.v_proj.biases": "model-00002-of-00009.safetensors",
1587
  "model.layers.7.self_attn.v_proj.scales": "model-00002-of-00009.safetensors",
1588
  "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00009.safetensors",
1589
  "model.layers.8.input_layernorm.weight": "model-00002-of-00009.safetensors",
 
1593
  "model.layers.8.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1594
  "model.layers.8.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1595
  "model.layers.8.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
1596
+ "model.layers.8.linear_attn.in_proj_qkvz.biases": "model-00002-of-00009.safetensors",
1597
  "model.layers.8.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1598
  "model.layers.8.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1599
  "model.layers.8.linear_attn.norm.weight": "model-00002-of-00009.safetensors",
 
1627
  "model.layers.9.linear_attn.in_proj_ba.biases": "model-00002-of-00009.safetensors",
1628
  "model.layers.9.linear_attn.in_proj_ba.scales": "model-00002-of-00009.safetensors",
1629
  "model.layers.9.linear_attn.in_proj_ba.weight": "model-00002-of-00009.safetensors",
1630
+ "model.layers.9.linear_attn.in_proj_qkvz.biases": "model-00002-of-00009.safetensors",
1631
  "model.layers.9.linear_attn.in_proj_qkvz.scales": "model-00002-of-00009.safetensors",
1632
  "model.layers.9.linear_attn.in_proj_qkvz.weight": "model-00002-of-00009.safetensors",
1633
  "model.layers.9.linear_attn.norm.weight": "model-00002-of-00009.safetensors",