spicyneuron committed on
Commit
10ea069
·
verified ·
1 Parent(s): 453239a

Add files using upload-large-folder tool

Browse files
config.json CHANGED
@@ -34,9 +34,9 @@
34
  "output_router_logits": false,
35
  "partial_rotary_factor": 0.25,
36
  "quantization": {
37
- "group_size": 64,
38
  "bits": 4,
39
- "mode": "affine",
40
  "model.embed_tokens": {
41
  "group_size": 64,
42
  "bits": 8,
@@ -72,6 +72,21 @@
72
  "bits": 4,
73
  "mode": "mxfp4"
74
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "model.layers.1.linear_attn.in_proj_qkvz": {
76
  "group_size": 64,
77
  "bits": 8,
@@ -102,6 +117,21 @@
102
  "bits": 4,
103
  "mode": "mxfp4"
104
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  "model.layers.2.linear_attn.in_proj_qkvz": {
106
  "group_size": 64,
107
  "bits": 8,
@@ -132,6 +162,21 @@
132
  "bits": 4,
133
  "mode": "mxfp4"
134
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  "model.layers.3.self_attn.q_proj": {
136
  "group_size": 64,
137
  "bits": 8,
@@ -167,6 +212,21 @@
167
  "bits": 4,
168
  "mode": "mxfp4"
169
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  "model.layers.4.linear_attn.in_proj_qkvz": {
171
  "group_size": 64,
172
  "bits": 8,
@@ -197,6 +257,21 @@
197
  "bits": 4,
198
  "mode": "mxfp4"
199
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  "model.layers.5.linear_attn.in_proj_qkvz": {
201
  "group_size": 64,
202
  "bits": 8,
@@ -227,6 +302,21 @@
227
  "bits": 4,
228
  "mode": "mxfp4"
229
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  "model.layers.6.linear_attn.in_proj_qkvz": {
231
  "group_size": 64,
232
  "bits": 8,
@@ -257,6 +347,21 @@
257
  "bits": 4,
258
  "mode": "mxfp4"
259
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  "model.layers.7.self_attn.q_proj": {
261
  "group_size": 64,
262
  "bits": 8,
@@ -292,6 +397,21 @@
292
  "bits": 4,
293
  "mode": "mxfp4"
294
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  "model.layers.8.linear_attn.in_proj_qkvz": {
296
  "group_size": 64,
297
  "bits": 8,
@@ -322,6 +442,21 @@
322
  "bits": 4,
323
  "mode": "mxfp4"
324
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  "model.layers.9.linear_attn.in_proj_qkvz": {
326
  "group_size": 64,
327
  "bits": 8,
@@ -352,6 +487,21 @@
352
  "bits": 4,
353
  "mode": "mxfp4"
354
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  "model.layers.10.linear_attn.in_proj_qkvz": {
356
  "group_size": 64,
357
  "bits": 8,
@@ -382,6 +532,21 @@
382
  "bits": 4,
383
  "mode": "mxfp4"
384
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  "model.layers.11.self_attn.q_proj": {
386
  "group_size": 64,
387
  "bits": 8,
@@ -417,6 +582,21 @@
417
  "bits": 4,
418
  "mode": "mxfp4"
419
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  "model.layers.12.linear_attn.in_proj_qkvz": {
421
  "group_size": 64,
422
  "bits": 8,
@@ -447,6 +627,21 @@
447
  "bits": 4,
448
  "mode": "mxfp4"
449
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  "model.layers.13.linear_attn.in_proj_qkvz": {
451
  "group_size": 64,
452
  "bits": 8,
@@ -477,6 +672,21 @@
477
  "bits": 4,
478
  "mode": "mxfp4"
479
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  "model.layers.14.linear_attn.in_proj_qkvz": {
481
  "group_size": 64,
482
  "bits": 8,
@@ -507,6 +717,21 @@
507
  "bits": 4,
508
  "mode": "mxfp4"
509
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  "model.layers.15.self_attn.q_proj": {
511
  "group_size": 64,
512
  "bits": 8,
@@ -542,6 +767,21 @@
542
  "bits": 4,
543
  "mode": "mxfp4"
544
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  "model.layers.16.linear_attn.in_proj_qkvz": {
546
  "group_size": 64,
547
  "bits": 8,
@@ -572,6 +812,21 @@
572
  "bits": 4,
573
  "mode": "mxfp4"
574
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  "model.layers.17.linear_attn.in_proj_qkvz": {
576
  "group_size": 64,
577
  "bits": 8,
@@ -602,6 +857,21 @@
602
  "bits": 4,
603
  "mode": "mxfp4"
604
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  "model.layers.18.linear_attn.in_proj_qkvz": {
606
  "group_size": 64,
607
  "bits": 8,
@@ -632,6 +902,21 @@
632
  "bits": 4,
633
  "mode": "mxfp4"
634
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  "model.layers.19.self_attn.q_proj": {
636
  "group_size": 64,
637
  "bits": 8,
@@ -667,6 +952,21 @@
667
  "bits": 4,
668
  "mode": "mxfp4"
669
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  "model.layers.20.linear_attn.in_proj_qkvz": {
671
  "group_size": 64,
672
  "bits": 8,
@@ -697,6 +997,21 @@
697
  "bits": 4,
698
  "mode": "mxfp4"
699
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  "model.layers.21.linear_attn.in_proj_qkvz": {
701
  "group_size": 64,
702
  "bits": 8,
@@ -727,22 +1042,37 @@
727
  "bits": 4,
728
  "mode": "mxfp4"
729
  },
730
- "model.layers.22.linear_attn.in_proj_qkvz": {
731
  "group_size": 64,
732
  "bits": 8,
733
  "mode": "affine"
734
  },
735
- "model.layers.22.linear_attn.in_proj_ba": {
736
  "group_size": 64,
737
  "bits": 8,
738
  "mode": "affine"
739
  },
740
- "model.layers.22.linear_attn.out_proj": {
741
  "group_size": 64,
742
  "bits": 8,
743
  "mode": "affine"
744
  },
745
- "model.layers.22.mlp.switch_mlp.gate_proj": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
746
  "group_size": 32,
747
  "bits": 4,
748
  "mode": "mxfp4"
@@ -757,6 +1087,21 @@
757
  "bits": 4,
758
  "mode": "mxfp4"
759
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  "model.layers.23.self_attn.q_proj": {
761
  "group_size": 64,
762
  "bits": 8,
@@ -792,6 +1137,21 @@
792
  "bits": 4,
793
  "mode": "mxfp4"
794
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  "model.layers.24.linear_attn.in_proj_qkvz": {
796
  "group_size": 64,
797
  "bits": 8,
@@ -822,6 +1182,21 @@
822
  "bits": 4,
823
  "mode": "mxfp4"
824
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
  "model.layers.25.linear_attn.in_proj_qkvz": {
826
  "group_size": 64,
827
  "bits": 8,
@@ -852,6 +1227,21 @@
852
  "bits": 4,
853
  "mode": "mxfp4"
854
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  "model.layers.26.linear_attn.in_proj_qkvz": {
856
  "group_size": 64,
857
  "bits": 8,
@@ -882,6 +1272,21 @@
882
  "bits": 4,
883
  "mode": "mxfp4"
884
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885
  "model.layers.27.self_attn.q_proj": {
886
  "group_size": 64,
887
  "bits": 8,
@@ -917,6 +1322,21 @@
917
  "bits": 4,
918
  "mode": "mxfp4"
919
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
920
  "model.layers.28.linear_attn.in_proj_qkvz": {
921
  "group_size": 64,
922
  "bits": 8,
@@ -947,6 +1367,21 @@
947
  "bits": 4,
948
  "mode": "mxfp4"
949
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  "model.layers.29.linear_attn.in_proj_qkvz": {
951
  "group_size": 64,
952
  "bits": 8,
@@ -977,6 +1412,21 @@
977
  "bits": 4,
978
  "mode": "mxfp4"
979
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
980
  "model.layers.30.linear_attn.in_proj_qkvz": {
981
  "group_size": 64,
982
  "bits": 8,
@@ -1007,6 +1457,21 @@
1007
  "bits": 4,
1008
  "mode": "mxfp4"
1009
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1010
  "model.layers.31.self_attn.q_proj": {
1011
  "group_size": 64,
1012
  "bits": 8,
@@ -1042,6 +1507,21 @@
1042
  "bits": 4,
1043
  "mode": "mxfp4"
1044
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  "model.layers.32.linear_attn.in_proj_qkvz": {
1046
  "group_size": 64,
1047
  "bits": 8,
@@ -1072,6 +1552,21 @@
1072
  "bits": 4,
1073
  "mode": "mxfp4"
1074
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1075
  "model.layers.33.linear_attn.in_proj_qkvz": {
1076
  "group_size": 64,
1077
  "bits": 8,
@@ -1102,6 +1597,21 @@
1102
  "bits": 4,
1103
  "mode": "mxfp4"
1104
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1105
  "model.layers.34.linear_attn.in_proj_qkvz": {
1106
  "group_size": 64,
1107
  "bits": 8,
@@ -1132,6 +1642,21 @@
1132
  "bits": 4,
1133
  "mode": "mxfp4"
1134
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1135
  "model.layers.35.self_attn.q_proj": {
1136
  "group_size": 64,
1137
  "bits": 8,
@@ -1167,6 +1692,21 @@
1167
  "bits": 4,
1168
  "mode": "mxfp4"
1169
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1170
  "model.layers.36.linear_attn.in_proj_qkvz": {
1171
  "group_size": 64,
1172
  "bits": 8,
@@ -1197,6 +1737,21 @@
1197
  "bits": 4,
1198
  "mode": "mxfp4"
1199
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1200
  "model.layers.37.linear_attn.in_proj_qkvz": {
1201
  "group_size": 64,
1202
  "bits": 8,
@@ -1227,6 +1782,21 @@
1227
  "bits": 4,
1228
  "mode": "mxfp4"
1229
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1230
  "model.layers.38.linear_attn.in_proj_qkvz": {
1231
  "group_size": 64,
1232
  "bits": 8,
@@ -1257,6 +1827,21 @@
1257
  "bits": 4,
1258
  "mode": "mxfp4"
1259
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1260
  "model.layers.39.self_attn.q_proj": {
1261
  "group_size": 64,
1262
  "bits": 8,
@@ -1292,6 +1877,21 @@
1292
  "bits": 4,
1293
  "mode": "mxfp4"
1294
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
  "model.layers.40.linear_attn.in_proj_qkvz": {
1296
  "group_size": 64,
1297
  "bits": 8,
@@ -1322,6 +1922,21 @@
1322
  "bits": 4,
1323
  "mode": "mxfp4"
1324
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1325
  "model.layers.41.linear_attn.in_proj_qkvz": {
1326
  "group_size": 64,
1327
  "bits": 8,
@@ -1352,6 +1967,21 @@
1352
  "bits": 4,
1353
  "mode": "mxfp4"
1354
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1355
  "model.layers.42.linear_attn.in_proj_qkvz": {
1356
  "group_size": 64,
1357
  "bits": 8,
@@ -1382,6 +2012,21 @@
1382
  "bits": 4,
1383
  "mode": "mxfp4"
1384
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1385
  "model.layers.43.self_attn.q_proj": {
1386
  "group_size": 64,
1387
  "bits": 8,
@@ -1417,6 +2062,21 @@
1417
  "bits": 4,
1418
  "mode": "mxfp4"
1419
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1420
  "model.layers.44.linear_attn.in_proj_qkvz": {
1421
  "group_size": 64,
1422
  "bits": 8,
@@ -1447,6 +2107,21 @@
1447
  "bits": 4,
1448
  "mode": "mxfp4"
1449
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  "model.layers.45.linear_attn.in_proj_qkvz": {
1451
  "group_size": 64,
1452
  "bits": 8,
@@ -1477,6 +2152,21 @@
1477
  "bits": 4,
1478
  "mode": "mxfp4"
1479
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1480
  "model.layers.46.linear_attn.in_proj_qkvz": {
1481
  "group_size": 64,
1482
  "bits": 8,
@@ -1507,6 +2197,21 @@
1507
  "bits": 4,
1508
  "mode": "mxfp4"
1509
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1510
  "model.layers.47.self_attn.q_proj": {
1511
  "group_size": 64,
1512
  "bits": 8,
@@ -1542,6 +2247,21 @@
1542
  "bits": 4,
1543
  "mode": "mxfp4"
1544
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1545
  "lm_head": {
1546
  "group_size": 64,
1547
  "bits": 8,
@@ -1549,9 +2269,9 @@
1549
  }
1550
  },
1551
  "quantization_config": {
1552
- "group_size": 64,
1553
  "bits": 4,
1554
- "mode": "affine",
1555
  "model.embed_tokens": {
1556
  "group_size": 64,
1557
  "bits": 8,
@@ -1587,6 +2307,21 @@
1587
  "bits": 4,
1588
  "mode": "mxfp4"
1589
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1590
  "model.layers.1.linear_attn.in_proj_qkvz": {
1591
  "group_size": 64,
1592
  "bits": 8,
@@ -1617,6 +2352,21 @@
1617
  "bits": 4,
1618
  "mode": "mxfp4"
1619
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1620
  "model.layers.2.linear_attn.in_proj_qkvz": {
1621
  "group_size": 64,
1622
  "bits": 8,
@@ -1647,6 +2397,21 @@
1647
  "bits": 4,
1648
  "mode": "mxfp4"
1649
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1650
  "model.layers.3.self_attn.q_proj": {
1651
  "group_size": 64,
1652
  "bits": 8,
@@ -1682,6 +2447,21 @@
1682
  "bits": 4,
1683
  "mode": "mxfp4"
1684
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1685
  "model.layers.4.linear_attn.in_proj_qkvz": {
1686
  "group_size": 64,
1687
  "bits": 8,
@@ -1712,6 +2492,21 @@
1712
  "bits": 4,
1713
  "mode": "mxfp4"
1714
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1715
  "model.layers.5.linear_attn.in_proj_qkvz": {
1716
  "group_size": 64,
1717
  "bits": 8,
@@ -1737,10 +2532,25 @@
1737
  "bits": 4,
1738
  "mode": "mxfp4"
1739
  },
1740
- "model.layers.5.mlp.switch_mlp.down_proj": {
1741
- "group_size": 32,
1742
- "bits": 4,
1743
- "mode": "mxfp4"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1744
  },
1745
  "model.layers.6.linear_attn.in_proj_qkvz": {
1746
  "group_size": 64,
@@ -1772,6 +2582,21 @@
1772
  "bits": 4,
1773
  "mode": "mxfp4"
1774
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1775
  "model.layers.7.self_attn.q_proj": {
1776
  "group_size": 64,
1777
  "bits": 8,
@@ -1807,6 +2632,21 @@
1807
  "bits": 4,
1808
  "mode": "mxfp4"
1809
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1810
  "model.layers.8.linear_attn.in_proj_qkvz": {
1811
  "group_size": 64,
1812
  "bits": 8,
@@ -1837,6 +2677,21 @@
1837
  "bits": 4,
1838
  "mode": "mxfp4"
1839
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1840
  "model.layers.9.linear_attn.in_proj_qkvz": {
1841
  "group_size": 64,
1842
  "bits": 8,
@@ -1867,6 +2722,21 @@
1867
  "bits": 4,
1868
  "mode": "mxfp4"
1869
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1870
  "model.layers.10.linear_attn.in_proj_qkvz": {
1871
  "group_size": 64,
1872
  "bits": 8,
@@ -1897,6 +2767,21 @@
1897
  "bits": 4,
1898
  "mode": "mxfp4"
1899
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1900
  "model.layers.11.self_attn.q_proj": {
1901
  "group_size": 64,
1902
  "bits": 8,
@@ -1932,6 +2817,21 @@
1932
  "bits": 4,
1933
  "mode": "mxfp4"
1934
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1935
  "model.layers.12.linear_attn.in_proj_qkvz": {
1936
  "group_size": 64,
1937
  "bits": 8,
@@ -1962,6 +2862,21 @@
1962
  "bits": 4,
1963
  "mode": "mxfp4"
1964
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1965
  "model.layers.13.linear_attn.in_proj_qkvz": {
1966
  "group_size": 64,
1967
  "bits": 8,
@@ -1992,6 +2907,21 @@
1992
  "bits": 4,
1993
  "mode": "mxfp4"
1994
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1995
  "model.layers.14.linear_attn.in_proj_qkvz": {
1996
  "group_size": 64,
1997
  "bits": 8,
@@ -2022,6 +2952,21 @@
2022
  "bits": 4,
2023
  "mode": "mxfp4"
2024
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2025
  "model.layers.15.self_attn.q_proj": {
2026
  "group_size": 64,
2027
  "bits": 8,
@@ -2057,6 +3002,21 @@
2057
  "bits": 4,
2058
  "mode": "mxfp4"
2059
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2060
  "model.layers.16.linear_attn.in_proj_qkvz": {
2061
  "group_size": 64,
2062
  "bits": 8,
@@ -2087,6 +3047,21 @@
2087
  "bits": 4,
2088
  "mode": "mxfp4"
2089
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2090
  "model.layers.17.linear_attn.in_proj_qkvz": {
2091
  "group_size": 64,
2092
  "bits": 8,
@@ -2117,6 +3092,21 @@
2117
  "bits": 4,
2118
  "mode": "mxfp4"
2119
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2120
  "model.layers.18.linear_attn.in_proj_qkvz": {
2121
  "group_size": 64,
2122
  "bits": 8,
@@ -2147,6 +3137,21 @@
2147
  "bits": 4,
2148
  "mode": "mxfp4"
2149
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2150
  "model.layers.19.self_attn.q_proj": {
2151
  "group_size": 64,
2152
  "bits": 8,
@@ -2182,6 +3187,21 @@
2182
  "bits": 4,
2183
  "mode": "mxfp4"
2184
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2185
  "model.layers.20.linear_attn.in_proj_qkvz": {
2186
  "group_size": 64,
2187
  "bits": 8,
@@ -2212,6 +3232,21 @@
2212
  "bits": 4,
2213
  "mode": "mxfp4"
2214
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2215
  "model.layers.21.linear_attn.in_proj_qkvz": {
2216
  "group_size": 64,
2217
  "bits": 8,
@@ -2242,6 +3277,21 @@
2242
  "bits": 4,
2243
  "mode": "mxfp4"
2244
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2245
  "model.layers.22.linear_attn.in_proj_qkvz": {
2246
  "group_size": 64,
2247
  "bits": 8,
@@ -2272,6 +3322,21 @@
2272
  "bits": 4,
2273
  "mode": "mxfp4"
2274
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2275
  "model.layers.23.self_attn.q_proj": {
2276
  "group_size": 64,
2277
  "bits": 8,
@@ -2307,6 +3372,21 @@
2307
  "bits": 4,
2308
  "mode": "mxfp4"
2309
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2310
  "model.layers.24.linear_attn.in_proj_qkvz": {
2311
  "group_size": 64,
2312
  "bits": 8,
@@ -2337,6 +3417,21 @@
2337
  "bits": 4,
2338
  "mode": "mxfp4"
2339
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2340
  "model.layers.25.linear_attn.in_proj_qkvz": {
2341
  "group_size": 64,
2342
  "bits": 8,
@@ -2367,6 +3462,21 @@
2367
  "bits": 4,
2368
  "mode": "mxfp4"
2369
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2370
  "model.layers.26.linear_attn.in_proj_qkvz": {
2371
  "group_size": 64,
2372
  "bits": 8,
@@ -2397,6 +3507,21 @@
2397
  "bits": 4,
2398
  "mode": "mxfp4"
2399
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2400
  "model.layers.27.self_attn.q_proj": {
2401
  "group_size": 64,
2402
  "bits": 8,
@@ -2432,6 +3557,21 @@
2432
  "bits": 4,
2433
  "mode": "mxfp4"
2434
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2435
  "model.layers.28.linear_attn.in_proj_qkvz": {
2436
  "group_size": 64,
2437
  "bits": 8,
@@ -2462,6 +3602,21 @@
2462
  "bits": 4,
2463
  "mode": "mxfp4"
2464
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2465
  "model.layers.29.linear_attn.in_proj_qkvz": {
2466
  "group_size": 64,
2467
  "bits": 8,
@@ -2492,6 +3647,21 @@
2492
  "bits": 4,
2493
  "mode": "mxfp4"
2494
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2495
  "model.layers.30.linear_attn.in_proj_qkvz": {
2496
  "group_size": 64,
2497
  "bits": 8,
@@ -2522,6 +3692,21 @@
2522
  "bits": 4,
2523
  "mode": "mxfp4"
2524
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2525
  "model.layers.31.self_attn.q_proj": {
2526
  "group_size": 64,
2527
  "bits": 8,
@@ -2557,6 +3742,21 @@
2557
  "bits": 4,
2558
  "mode": "mxfp4"
2559
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2560
  "model.layers.32.linear_attn.in_proj_qkvz": {
2561
  "group_size": 64,
2562
  "bits": 8,
@@ -2587,6 +3787,21 @@
2587
  "bits": 4,
2588
  "mode": "mxfp4"
2589
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2590
  "model.layers.33.linear_attn.in_proj_qkvz": {
2591
  "group_size": 64,
2592
  "bits": 8,
@@ -2617,6 +3832,21 @@
2617
  "bits": 4,
2618
  "mode": "mxfp4"
2619
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2620
  "model.layers.34.linear_attn.in_proj_qkvz": {
2621
  "group_size": 64,
2622
  "bits": 8,
@@ -2647,6 +3877,21 @@
2647
  "bits": 4,
2648
  "mode": "mxfp4"
2649
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2650
  "model.layers.35.self_attn.q_proj": {
2651
  "group_size": 64,
2652
  "bits": 8,
@@ -2682,6 +3927,21 @@
2682
  "bits": 4,
2683
  "mode": "mxfp4"
2684
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2685
  "model.layers.36.linear_attn.in_proj_qkvz": {
2686
  "group_size": 64,
2687
  "bits": 8,
@@ -2712,6 +3972,21 @@
2712
  "bits": 4,
2713
  "mode": "mxfp4"
2714
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2715
  "model.layers.37.linear_attn.in_proj_qkvz": {
2716
  "group_size": 64,
2717
  "bits": 8,
@@ -2742,6 +4017,21 @@
2742
  "bits": 4,
2743
  "mode": "mxfp4"
2744
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2745
  "model.layers.38.linear_attn.in_proj_qkvz": {
2746
  "group_size": 64,
2747
  "bits": 8,
@@ -2772,6 +4062,21 @@
2772
  "bits": 4,
2773
  "mode": "mxfp4"
2774
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2775
  "model.layers.39.self_attn.q_proj": {
2776
  "group_size": 64,
2777
  "bits": 8,
@@ -2807,6 +4112,21 @@
2807
  "bits": 4,
2808
  "mode": "mxfp4"
2809
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2810
  "model.layers.40.linear_attn.in_proj_qkvz": {
2811
  "group_size": 64,
2812
  "bits": 8,
@@ -2837,6 +4157,21 @@
2837
  "bits": 4,
2838
  "mode": "mxfp4"
2839
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2840
  "model.layers.41.linear_attn.in_proj_qkvz": {
2841
  "group_size": 64,
2842
  "bits": 8,
@@ -2867,6 +4202,21 @@
2867
  "bits": 4,
2868
  "mode": "mxfp4"
2869
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2870
  "model.layers.42.linear_attn.in_proj_qkvz": {
2871
  "group_size": 64,
2872
  "bits": 8,
@@ -2897,6 +4247,21 @@
2897
  "bits": 4,
2898
  "mode": "mxfp4"
2899
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2900
  "model.layers.43.self_attn.q_proj": {
2901
  "group_size": 64,
2902
  "bits": 8,
@@ -2932,6 +4297,21 @@
2932
  "bits": 4,
2933
  "mode": "mxfp4"
2934
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2935
  "model.layers.44.linear_attn.in_proj_qkvz": {
2936
  "group_size": 64,
2937
  "bits": 8,
@@ -2962,6 +4342,21 @@
2962
  "bits": 4,
2963
  "mode": "mxfp4"
2964
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2965
  "model.layers.45.linear_attn.in_proj_qkvz": {
2966
  "group_size": 64,
2967
  "bits": 8,
@@ -2992,6 +4387,21 @@
2992
  "bits": 4,
2993
  "mode": "mxfp4"
2994
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2995
  "model.layers.46.linear_attn.in_proj_qkvz": {
2996
  "group_size": 64,
2997
  "bits": 8,
@@ -3022,6 +4432,21 @@
3022
  "bits": 4,
3023
  "mode": "mxfp4"
3024
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3025
  "model.layers.47.self_attn.q_proj": {
3026
  "group_size": 64,
3027
  "bits": 8,
@@ -3057,6 +4482,21 @@
3057
  "bits": 4,
3058
  "mode": "mxfp4"
3059
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3060
  "lm_head": {
3061
  "group_size": 64,
3062
  "bits": 8,
 
34
  "output_router_logits": false,
35
  "partial_rotary_factor": 0.25,
36
  "quantization": {
37
+ "group_size": 32,
38
  "bits": 4,
39
+ "mode": "mxfp4",
40
  "model.embed_tokens": {
41
  "group_size": 64,
42
  "bits": 8,
 
72
  "bits": 4,
73
  "mode": "mxfp4"
74
  },
75
+ "model.layers.0.mlp.shared_expert.gate_proj": {
76
+ "group_size": 64,
77
+ "bits": 8,
78
+ "mode": "affine"
79
+ },
80
+ "model.layers.0.mlp.shared_expert.down_proj": {
81
+ "group_size": 64,
82
+ "bits": 8,
83
+ "mode": "affine"
84
+ },
85
+ "model.layers.0.mlp.shared_expert.up_proj": {
86
+ "group_size": 64,
87
+ "bits": 8,
88
+ "mode": "affine"
89
+ },
90
  "model.layers.1.linear_attn.in_proj_qkvz": {
91
  "group_size": 64,
92
  "bits": 8,
 
117
  "bits": 4,
118
  "mode": "mxfp4"
119
  },
120
+ "model.layers.1.mlp.shared_expert.gate_proj": {
121
+ "group_size": 64,
122
+ "bits": 8,
123
+ "mode": "affine"
124
+ },
125
+ "model.layers.1.mlp.shared_expert.down_proj": {
126
+ "group_size": 64,
127
+ "bits": 8,
128
+ "mode": "affine"
129
+ },
130
+ "model.layers.1.mlp.shared_expert.up_proj": {
131
+ "group_size": 64,
132
+ "bits": 8,
133
+ "mode": "affine"
134
+ },
135
  "model.layers.2.linear_attn.in_proj_qkvz": {
136
  "group_size": 64,
137
  "bits": 8,
 
162
  "bits": 4,
163
  "mode": "mxfp4"
164
  },
165
+ "model.layers.2.mlp.shared_expert.gate_proj": {
166
+ "group_size": 64,
167
+ "bits": 8,
168
+ "mode": "affine"
169
+ },
170
+ "model.layers.2.mlp.shared_expert.down_proj": {
171
+ "group_size": 64,
172
+ "bits": 8,
173
+ "mode": "affine"
174
+ },
175
+ "model.layers.2.mlp.shared_expert.up_proj": {
176
+ "group_size": 64,
177
+ "bits": 8,
178
+ "mode": "affine"
179
+ },
180
  "model.layers.3.self_attn.q_proj": {
181
  "group_size": 64,
182
  "bits": 8,
 
212
  "bits": 4,
213
  "mode": "mxfp4"
214
  },
215
+ "model.layers.3.mlp.shared_expert.gate_proj": {
216
+ "group_size": 64,
217
+ "bits": 8,
218
+ "mode": "affine"
219
+ },
220
+ "model.layers.3.mlp.shared_expert.down_proj": {
221
+ "group_size": 64,
222
+ "bits": 8,
223
+ "mode": "affine"
224
+ },
225
+ "model.layers.3.mlp.shared_expert.up_proj": {
226
+ "group_size": 64,
227
+ "bits": 8,
228
+ "mode": "affine"
229
+ },
230
  "model.layers.4.linear_attn.in_proj_qkvz": {
231
  "group_size": 64,
232
  "bits": 8,
 
257
  "bits": 4,
258
  "mode": "mxfp4"
259
  },
260
+ "model.layers.4.mlp.shared_expert.gate_proj": {
261
+ "group_size": 64,
262
+ "bits": 8,
263
+ "mode": "affine"
264
+ },
265
+ "model.layers.4.mlp.shared_expert.down_proj": {
266
+ "group_size": 64,
267
+ "bits": 8,
268
+ "mode": "affine"
269
+ },
270
+ "model.layers.4.mlp.shared_expert.up_proj": {
271
+ "group_size": 64,
272
+ "bits": 8,
273
+ "mode": "affine"
274
+ },
275
  "model.layers.5.linear_attn.in_proj_qkvz": {
276
  "group_size": 64,
277
  "bits": 8,
 
302
  "bits": 4,
303
  "mode": "mxfp4"
304
  },
305
+ "model.layers.5.mlp.shared_expert.gate_proj": {
306
+ "group_size": 64,
307
+ "bits": 8,
308
+ "mode": "affine"
309
+ },
310
+ "model.layers.5.mlp.shared_expert.down_proj": {
311
+ "group_size": 64,
312
+ "bits": 8,
313
+ "mode": "affine"
314
+ },
315
+ "model.layers.5.mlp.shared_expert.up_proj": {
316
+ "group_size": 64,
317
+ "bits": 8,
318
+ "mode": "affine"
319
+ },
320
  "model.layers.6.linear_attn.in_proj_qkvz": {
321
  "group_size": 64,
322
  "bits": 8,
 
347
  "bits": 4,
348
  "mode": "mxfp4"
349
  },
350
+ "model.layers.6.mlp.shared_expert.gate_proj": {
351
+ "group_size": 64,
352
+ "bits": 8,
353
+ "mode": "affine"
354
+ },
355
+ "model.layers.6.mlp.shared_expert.down_proj": {
356
+ "group_size": 64,
357
+ "bits": 8,
358
+ "mode": "affine"
359
+ },
360
+ "model.layers.6.mlp.shared_expert.up_proj": {
361
+ "group_size": 64,
362
+ "bits": 8,
363
+ "mode": "affine"
364
+ },
365
  "model.layers.7.self_attn.q_proj": {
366
  "group_size": 64,
367
  "bits": 8,
 
397
  "bits": 4,
398
  "mode": "mxfp4"
399
  },
400
+ "model.layers.7.mlp.shared_expert.gate_proj": {
401
+ "group_size": 64,
402
+ "bits": 8,
403
+ "mode": "affine"
404
+ },
405
+ "model.layers.7.mlp.shared_expert.down_proj": {
406
+ "group_size": 64,
407
+ "bits": 8,
408
+ "mode": "affine"
409
+ },
410
+ "model.layers.7.mlp.shared_expert.up_proj": {
411
+ "group_size": 64,
412
+ "bits": 8,
413
+ "mode": "affine"
414
+ },
415
  "model.layers.8.linear_attn.in_proj_qkvz": {
416
  "group_size": 64,
417
  "bits": 8,
 
442
  "bits": 4,
443
  "mode": "mxfp4"
444
  },
445
+ "model.layers.8.mlp.shared_expert.gate_proj": {
446
+ "group_size": 64,
447
+ "bits": 8,
448
+ "mode": "affine"
449
+ },
450
+ "model.layers.8.mlp.shared_expert.down_proj": {
451
+ "group_size": 64,
452
+ "bits": 8,
453
+ "mode": "affine"
454
+ },
455
+ "model.layers.8.mlp.shared_expert.up_proj": {
456
+ "group_size": 64,
457
+ "bits": 8,
458
+ "mode": "affine"
459
+ },
460
  "model.layers.9.linear_attn.in_proj_qkvz": {
461
  "group_size": 64,
462
  "bits": 8,
 
487
  "bits": 4,
488
  "mode": "mxfp4"
489
  },
490
+ "model.layers.9.mlp.shared_expert.gate_proj": {
491
+ "group_size": 64,
492
+ "bits": 8,
493
+ "mode": "affine"
494
+ },
495
+ "model.layers.9.mlp.shared_expert.down_proj": {
496
+ "group_size": 64,
497
+ "bits": 8,
498
+ "mode": "affine"
499
+ },
500
+ "model.layers.9.mlp.shared_expert.up_proj": {
501
+ "group_size": 64,
502
+ "bits": 8,
503
+ "mode": "affine"
504
+ },
505
  "model.layers.10.linear_attn.in_proj_qkvz": {
506
  "group_size": 64,
507
  "bits": 8,
 
532
  "bits": 4,
533
  "mode": "mxfp4"
534
  },
535
+ "model.layers.10.mlp.shared_expert.gate_proj": {
536
+ "group_size": 64,
537
+ "bits": 8,
538
+ "mode": "affine"
539
+ },
540
+ "model.layers.10.mlp.shared_expert.down_proj": {
541
+ "group_size": 64,
542
+ "bits": 8,
543
+ "mode": "affine"
544
+ },
545
+ "model.layers.10.mlp.shared_expert.up_proj": {
546
+ "group_size": 64,
547
+ "bits": 8,
548
+ "mode": "affine"
549
+ },
550
  "model.layers.11.self_attn.q_proj": {
551
  "group_size": 64,
552
  "bits": 8,
 
582
  "bits": 4,
583
  "mode": "mxfp4"
584
  },
585
+ "model.layers.11.mlp.shared_expert.gate_proj": {
586
+ "group_size": 64,
587
+ "bits": 8,
588
+ "mode": "affine"
589
+ },
590
+ "model.layers.11.mlp.shared_expert.down_proj": {
591
+ "group_size": 64,
592
+ "bits": 8,
593
+ "mode": "affine"
594
+ },
595
+ "model.layers.11.mlp.shared_expert.up_proj": {
596
+ "group_size": 64,
597
+ "bits": 8,
598
+ "mode": "affine"
599
+ },
600
  "model.layers.12.linear_attn.in_proj_qkvz": {
601
  "group_size": 64,
602
  "bits": 8,
 
627
  "bits": 4,
628
  "mode": "mxfp4"
629
  },
630
+ "model.layers.12.mlp.shared_expert.gate_proj": {
631
+ "group_size": 64,
632
+ "bits": 8,
633
+ "mode": "affine"
634
+ },
635
+ "model.layers.12.mlp.shared_expert.down_proj": {
636
+ "group_size": 64,
637
+ "bits": 8,
638
+ "mode": "affine"
639
+ },
640
+ "model.layers.12.mlp.shared_expert.up_proj": {
641
+ "group_size": 64,
642
+ "bits": 8,
643
+ "mode": "affine"
644
+ },
645
  "model.layers.13.linear_attn.in_proj_qkvz": {
646
  "group_size": 64,
647
  "bits": 8,
 
672
  "bits": 4,
673
  "mode": "mxfp4"
674
  },
675
+ "model.layers.13.mlp.shared_expert.gate_proj": {
676
+ "group_size": 64,
677
+ "bits": 8,
678
+ "mode": "affine"
679
+ },
680
+ "model.layers.13.mlp.shared_expert.down_proj": {
681
+ "group_size": 64,
682
+ "bits": 8,
683
+ "mode": "affine"
684
+ },
685
+ "model.layers.13.mlp.shared_expert.up_proj": {
686
+ "group_size": 64,
687
+ "bits": 8,
688
+ "mode": "affine"
689
+ },
690
  "model.layers.14.linear_attn.in_proj_qkvz": {
691
  "group_size": 64,
692
  "bits": 8,
 
717
  "bits": 4,
718
  "mode": "mxfp4"
719
  },
720
+ "model.layers.14.mlp.shared_expert.gate_proj": {
721
+ "group_size": 64,
722
+ "bits": 8,
723
+ "mode": "affine"
724
+ },
725
+ "model.layers.14.mlp.shared_expert.down_proj": {
726
+ "group_size": 64,
727
+ "bits": 8,
728
+ "mode": "affine"
729
+ },
730
+ "model.layers.14.mlp.shared_expert.up_proj": {
731
+ "group_size": 64,
732
+ "bits": 8,
733
+ "mode": "affine"
734
+ },
735
  "model.layers.15.self_attn.q_proj": {
736
  "group_size": 64,
737
  "bits": 8,
 
767
  "bits": 4,
768
  "mode": "mxfp4"
769
  },
770
+ "model.layers.15.mlp.shared_expert.gate_proj": {
771
+ "group_size": 64,
772
+ "bits": 8,
773
+ "mode": "affine"
774
+ },
775
+ "model.layers.15.mlp.shared_expert.down_proj": {
776
+ "group_size": 64,
777
+ "bits": 8,
778
+ "mode": "affine"
779
+ },
780
+ "model.layers.15.mlp.shared_expert.up_proj": {
781
+ "group_size": 64,
782
+ "bits": 8,
783
+ "mode": "affine"
784
+ },
785
  "model.layers.16.linear_attn.in_proj_qkvz": {
786
  "group_size": 64,
787
  "bits": 8,
 
812
  "bits": 4,
813
  "mode": "mxfp4"
814
  },
815
+ "model.layers.16.mlp.shared_expert.gate_proj": {
816
+ "group_size": 64,
817
+ "bits": 8,
818
+ "mode": "affine"
819
+ },
820
+ "model.layers.16.mlp.shared_expert.down_proj": {
821
+ "group_size": 64,
822
+ "bits": 8,
823
+ "mode": "affine"
824
+ },
825
+ "model.layers.16.mlp.shared_expert.up_proj": {
826
+ "group_size": 64,
827
+ "bits": 8,
828
+ "mode": "affine"
829
+ },
830
  "model.layers.17.linear_attn.in_proj_qkvz": {
831
  "group_size": 64,
832
  "bits": 8,
 
857
  "bits": 4,
858
  "mode": "mxfp4"
859
  },
860
+ "model.layers.17.mlp.shared_expert.gate_proj": {
861
+ "group_size": 64,
862
+ "bits": 8,
863
+ "mode": "affine"
864
+ },
865
+ "model.layers.17.mlp.shared_expert.down_proj": {
866
+ "group_size": 64,
867
+ "bits": 8,
868
+ "mode": "affine"
869
+ },
870
+ "model.layers.17.mlp.shared_expert.up_proj": {
871
+ "group_size": 64,
872
+ "bits": 8,
873
+ "mode": "affine"
874
+ },
875
  "model.layers.18.linear_attn.in_proj_qkvz": {
876
  "group_size": 64,
877
  "bits": 8,
 
902
  "bits": 4,
903
  "mode": "mxfp4"
904
  },
905
+ "model.layers.18.mlp.shared_expert.gate_proj": {
906
+ "group_size": 64,
907
+ "bits": 8,
908
+ "mode": "affine"
909
+ },
910
+ "model.layers.18.mlp.shared_expert.down_proj": {
911
+ "group_size": 64,
912
+ "bits": 8,
913
+ "mode": "affine"
914
+ },
915
+ "model.layers.18.mlp.shared_expert.up_proj": {
916
+ "group_size": 64,
917
+ "bits": 8,
918
+ "mode": "affine"
919
+ },
920
  "model.layers.19.self_attn.q_proj": {
921
  "group_size": 64,
922
  "bits": 8,
 
952
  "bits": 4,
953
  "mode": "mxfp4"
954
  },
955
+ "model.layers.19.mlp.shared_expert.gate_proj": {
956
+ "group_size": 64,
957
+ "bits": 8,
958
+ "mode": "affine"
959
+ },
960
+ "model.layers.19.mlp.shared_expert.down_proj": {
961
+ "group_size": 64,
962
+ "bits": 8,
963
+ "mode": "affine"
964
+ },
965
+ "model.layers.19.mlp.shared_expert.up_proj": {
966
+ "group_size": 64,
967
+ "bits": 8,
968
+ "mode": "affine"
969
+ },
970
  "model.layers.20.linear_attn.in_proj_qkvz": {
971
  "group_size": 64,
972
  "bits": 8,
 
997
  "bits": 4,
998
  "mode": "mxfp4"
999
  },
1000
+ "model.layers.20.mlp.shared_expert.gate_proj": {
1001
+ "group_size": 64,
1002
+ "bits": 8,
1003
+ "mode": "affine"
1004
+ },
1005
+ "model.layers.20.mlp.shared_expert.down_proj": {
1006
+ "group_size": 64,
1007
+ "bits": 8,
1008
+ "mode": "affine"
1009
+ },
1010
+ "model.layers.20.mlp.shared_expert.up_proj": {
1011
+ "group_size": 64,
1012
+ "bits": 8,
1013
+ "mode": "affine"
1014
+ },
1015
  "model.layers.21.linear_attn.in_proj_qkvz": {
1016
  "group_size": 64,
1017
  "bits": 8,
 
1042
  "bits": 4,
1043
  "mode": "mxfp4"
1044
  },
1045
+ "model.layers.21.mlp.shared_expert.gate_proj": {
1046
  "group_size": 64,
1047
  "bits": 8,
1048
  "mode": "affine"
1049
  },
1050
+ "model.layers.21.mlp.shared_expert.down_proj": {
1051
  "group_size": 64,
1052
  "bits": 8,
1053
  "mode": "affine"
1054
  },
1055
+ "model.layers.21.mlp.shared_expert.up_proj": {
1056
  "group_size": 64,
1057
  "bits": 8,
1058
  "mode": "affine"
1059
  },
1060
+ "model.layers.22.linear_attn.in_proj_qkvz": {
1061
+ "group_size": 64,
1062
+ "bits": 8,
1063
+ "mode": "affine"
1064
+ },
1065
+ "model.layers.22.linear_attn.in_proj_ba": {
1066
+ "group_size": 64,
1067
+ "bits": 8,
1068
+ "mode": "affine"
1069
+ },
1070
+ "model.layers.22.linear_attn.out_proj": {
1071
+ "group_size": 64,
1072
+ "bits": 8,
1073
+ "mode": "affine"
1074
+ },
1075
+ "model.layers.22.mlp.switch_mlp.gate_proj": {
1076
  "group_size": 32,
1077
  "bits": 4,
1078
  "mode": "mxfp4"
 
1087
  "bits": 4,
1088
  "mode": "mxfp4"
1089
  },
1090
+ "model.layers.22.mlp.shared_expert.gate_proj": {
1091
+ "group_size": 64,
1092
+ "bits": 8,
1093
+ "mode": "affine"
1094
+ },
1095
+ "model.layers.22.mlp.shared_expert.down_proj": {
1096
+ "group_size": 64,
1097
+ "bits": 8,
1098
+ "mode": "affine"
1099
+ },
1100
+ "model.layers.22.mlp.shared_expert.up_proj": {
1101
+ "group_size": 64,
1102
+ "bits": 8,
1103
+ "mode": "affine"
1104
+ },
1105
  "model.layers.23.self_attn.q_proj": {
1106
  "group_size": 64,
1107
  "bits": 8,
 
1137
  "bits": 4,
1138
  "mode": "mxfp4"
1139
  },
1140
+ "model.layers.23.mlp.shared_expert.gate_proj": {
1141
+ "group_size": 64,
1142
+ "bits": 8,
1143
+ "mode": "affine"
1144
+ },
1145
+ "model.layers.23.mlp.shared_expert.down_proj": {
1146
+ "group_size": 64,
1147
+ "bits": 8,
1148
+ "mode": "affine"
1149
+ },
1150
+ "model.layers.23.mlp.shared_expert.up_proj": {
1151
+ "group_size": 64,
1152
+ "bits": 8,
1153
+ "mode": "affine"
1154
+ },
1155
  "model.layers.24.linear_attn.in_proj_qkvz": {
1156
  "group_size": 64,
1157
  "bits": 8,
 
1182
  "bits": 4,
1183
  "mode": "mxfp4"
1184
  },
1185
+ "model.layers.24.mlp.shared_expert.gate_proj": {
1186
+ "group_size": 64,
1187
+ "bits": 8,
1188
+ "mode": "affine"
1189
+ },
1190
+ "model.layers.24.mlp.shared_expert.down_proj": {
1191
+ "group_size": 64,
1192
+ "bits": 8,
1193
+ "mode": "affine"
1194
+ },
1195
+ "model.layers.24.mlp.shared_expert.up_proj": {
1196
+ "group_size": 64,
1197
+ "bits": 8,
1198
+ "mode": "affine"
1199
+ },
1200
  "model.layers.25.linear_attn.in_proj_qkvz": {
1201
  "group_size": 64,
1202
  "bits": 8,
 
1227
  "bits": 4,
1228
  "mode": "mxfp4"
1229
  },
1230
+ "model.layers.25.mlp.shared_expert.gate_proj": {
1231
+ "group_size": 64,
1232
+ "bits": 8,
1233
+ "mode": "affine"
1234
+ },
1235
+ "model.layers.25.mlp.shared_expert.down_proj": {
1236
+ "group_size": 64,
1237
+ "bits": 8,
1238
+ "mode": "affine"
1239
+ },
1240
+ "model.layers.25.mlp.shared_expert.up_proj": {
1241
+ "group_size": 64,
1242
+ "bits": 8,
1243
+ "mode": "affine"
1244
+ },
1245
  "model.layers.26.linear_attn.in_proj_qkvz": {
1246
  "group_size": 64,
1247
  "bits": 8,
 
1272
  "bits": 4,
1273
  "mode": "mxfp4"
1274
  },
1275
+ "model.layers.26.mlp.shared_expert.gate_proj": {
1276
+ "group_size": 64,
1277
+ "bits": 8,
1278
+ "mode": "affine"
1279
+ },
1280
+ "model.layers.26.mlp.shared_expert.down_proj": {
1281
+ "group_size": 64,
1282
+ "bits": 8,
1283
+ "mode": "affine"
1284
+ },
1285
+ "model.layers.26.mlp.shared_expert.up_proj": {
1286
+ "group_size": 64,
1287
+ "bits": 8,
1288
+ "mode": "affine"
1289
+ },
1290
  "model.layers.27.self_attn.q_proj": {
1291
  "group_size": 64,
1292
  "bits": 8,
 
1322
  "bits": 4,
1323
  "mode": "mxfp4"
1324
  },
1325
+ "model.layers.27.mlp.shared_expert.gate_proj": {
1326
+ "group_size": 64,
1327
+ "bits": 8,
1328
+ "mode": "affine"
1329
+ },
1330
+ "model.layers.27.mlp.shared_expert.down_proj": {
1331
+ "group_size": 64,
1332
+ "bits": 8,
1333
+ "mode": "affine"
1334
+ },
1335
+ "model.layers.27.mlp.shared_expert.up_proj": {
1336
+ "group_size": 64,
1337
+ "bits": 8,
1338
+ "mode": "affine"
1339
+ },
1340
  "model.layers.28.linear_attn.in_proj_qkvz": {
1341
  "group_size": 64,
1342
  "bits": 8,
 
1367
  "bits": 4,
1368
  "mode": "mxfp4"
1369
  },
1370
+ "model.layers.28.mlp.shared_expert.gate_proj": {
1371
+ "group_size": 64,
1372
+ "bits": 8,
1373
+ "mode": "affine"
1374
+ },
1375
+ "model.layers.28.mlp.shared_expert.down_proj": {
1376
+ "group_size": 64,
1377
+ "bits": 8,
1378
+ "mode": "affine"
1379
+ },
1380
+ "model.layers.28.mlp.shared_expert.up_proj": {
1381
+ "group_size": 64,
1382
+ "bits": 8,
1383
+ "mode": "affine"
1384
+ },
1385
  "model.layers.29.linear_attn.in_proj_qkvz": {
1386
  "group_size": 64,
1387
  "bits": 8,
 
1412
  "bits": 4,
1413
  "mode": "mxfp4"
1414
  },
1415
+ "model.layers.29.mlp.shared_expert.gate_proj": {
1416
+ "group_size": 64,
1417
+ "bits": 8,
1418
+ "mode": "affine"
1419
+ },
1420
+ "model.layers.29.mlp.shared_expert.down_proj": {
1421
+ "group_size": 64,
1422
+ "bits": 8,
1423
+ "mode": "affine"
1424
+ },
1425
+ "model.layers.29.mlp.shared_expert.up_proj": {
1426
+ "group_size": 64,
1427
+ "bits": 8,
1428
+ "mode": "affine"
1429
+ },
1430
  "model.layers.30.linear_attn.in_proj_qkvz": {
1431
  "group_size": 64,
1432
  "bits": 8,
 
1457
  "bits": 4,
1458
  "mode": "mxfp4"
1459
  },
1460
+ "model.layers.30.mlp.shared_expert.gate_proj": {
1461
+ "group_size": 64,
1462
+ "bits": 8,
1463
+ "mode": "affine"
1464
+ },
1465
+ "model.layers.30.mlp.shared_expert.down_proj": {
1466
+ "group_size": 64,
1467
+ "bits": 8,
1468
+ "mode": "affine"
1469
+ },
1470
+ "model.layers.30.mlp.shared_expert.up_proj": {
1471
+ "group_size": 64,
1472
+ "bits": 8,
1473
+ "mode": "affine"
1474
+ },
1475
  "model.layers.31.self_attn.q_proj": {
1476
  "group_size": 64,
1477
  "bits": 8,
 
1507
  "bits": 4,
1508
  "mode": "mxfp4"
1509
  },
1510
+ "model.layers.31.mlp.shared_expert.gate_proj": {
1511
+ "group_size": 64,
1512
+ "bits": 8,
1513
+ "mode": "affine"
1514
+ },
1515
+ "model.layers.31.mlp.shared_expert.down_proj": {
1516
+ "group_size": 64,
1517
+ "bits": 8,
1518
+ "mode": "affine"
1519
+ },
1520
+ "model.layers.31.mlp.shared_expert.up_proj": {
1521
+ "group_size": 64,
1522
+ "bits": 8,
1523
+ "mode": "affine"
1524
+ },
1525
  "model.layers.32.linear_attn.in_proj_qkvz": {
1526
  "group_size": 64,
1527
  "bits": 8,
 
1552
  "bits": 4,
1553
  "mode": "mxfp4"
1554
  },
1555
+ "model.layers.32.mlp.shared_expert.gate_proj": {
1556
+ "group_size": 64,
1557
+ "bits": 8,
1558
+ "mode": "affine"
1559
+ },
1560
+ "model.layers.32.mlp.shared_expert.down_proj": {
1561
+ "group_size": 64,
1562
+ "bits": 8,
1563
+ "mode": "affine"
1564
+ },
1565
+ "model.layers.32.mlp.shared_expert.up_proj": {
1566
+ "group_size": 64,
1567
+ "bits": 8,
1568
+ "mode": "affine"
1569
+ },
1570
  "model.layers.33.linear_attn.in_proj_qkvz": {
1571
  "group_size": 64,
1572
  "bits": 8,
 
1597
  "bits": 4,
1598
  "mode": "mxfp4"
1599
  },
1600
+ "model.layers.33.mlp.shared_expert.gate_proj": {
1601
+ "group_size": 64,
1602
+ "bits": 8,
1603
+ "mode": "affine"
1604
+ },
1605
+ "model.layers.33.mlp.shared_expert.down_proj": {
1606
+ "group_size": 64,
1607
+ "bits": 8,
1608
+ "mode": "affine"
1609
+ },
1610
+ "model.layers.33.mlp.shared_expert.up_proj": {
1611
+ "group_size": 64,
1612
+ "bits": 8,
1613
+ "mode": "affine"
1614
+ },
1615
  "model.layers.34.linear_attn.in_proj_qkvz": {
1616
  "group_size": 64,
1617
  "bits": 8,
 
1642
  "bits": 4,
1643
  "mode": "mxfp4"
1644
  },
1645
+ "model.layers.34.mlp.shared_expert.gate_proj": {
1646
+ "group_size": 64,
1647
+ "bits": 8,
1648
+ "mode": "affine"
1649
+ },
1650
+ "model.layers.34.mlp.shared_expert.down_proj": {
1651
+ "group_size": 64,
1652
+ "bits": 8,
1653
+ "mode": "affine"
1654
+ },
1655
+ "model.layers.34.mlp.shared_expert.up_proj": {
1656
+ "group_size": 64,
1657
+ "bits": 8,
1658
+ "mode": "affine"
1659
+ },
1660
  "model.layers.35.self_attn.q_proj": {
1661
  "group_size": 64,
1662
  "bits": 8,
 
1692
  "bits": 4,
1693
  "mode": "mxfp4"
1694
  },
1695
+ "model.layers.35.mlp.shared_expert.gate_proj": {
1696
+ "group_size": 64,
1697
+ "bits": 8,
1698
+ "mode": "affine"
1699
+ },
1700
+ "model.layers.35.mlp.shared_expert.down_proj": {
1701
+ "group_size": 64,
1702
+ "bits": 8,
1703
+ "mode": "affine"
1704
+ },
1705
+ "model.layers.35.mlp.shared_expert.up_proj": {
1706
+ "group_size": 64,
1707
+ "bits": 8,
1708
+ "mode": "affine"
1709
+ },
1710
  "model.layers.36.linear_attn.in_proj_qkvz": {
1711
  "group_size": 64,
1712
  "bits": 8,
 
1737
  "bits": 4,
1738
  "mode": "mxfp4"
1739
  },
1740
+ "model.layers.36.mlp.shared_expert.gate_proj": {
1741
+ "group_size": 64,
1742
+ "bits": 8,
1743
+ "mode": "affine"
1744
+ },
1745
+ "model.layers.36.mlp.shared_expert.down_proj": {
1746
+ "group_size": 64,
1747
+ "bits": 8,
1748
+ "mode": "affine"
1749
+ },
1750
+ "model.layers.36.mlp.shared_expert.up_proj": {
1751
+ "group_size": 64,
1752
+ "bits": 8,
1753
+ "mode": "affine"
1754
+ },
1755
  "model.layers.37.linear_attn.in_proj_qkvz": {
1756
  "group_size": 64,
1757
  "bits": 8,
 
1782
  "bits": 4,
1783
  "mode": "mxfp4"
1784
  },
1785
+ "model.layers.37.mlp.shared_expert.gate_proj": {
1786
+ "group_size": 64,
1787
+ "bits": 8,
1788
+ "mode": "affine"
1789
+ },
1790
+ "model.layers.37.mlp.shared_expert.down_proj": {
1791
+ "group_size": 64,
1792
+ "bits": 8,
1793
+ "mode": "affine"
1794
+ },
1795
+ "model.layers.37.mlp.shared_expert.up_proj": {
1796
+ "group_size": 64,
1797
+ "bits": 8,
1798
+ "mode": "affine"
1799
+ },
1800
  "model.layers.38.linear_attn.in_proj_qkvz": {
1801
  "group_size": 64,
1802
  "bits": 8,
 
1827
  "bits": 4,
1828
  "mode": "mxfp4"
1829
  },
1830
+ "model.layers.38.mlp.shared_expert.gate_proj": {
1831
+ "group_size": 64,
1832
+ "bits": 8,
1833
+ "mode": "affine"
1834
+ },
1835
+ "model.layers.38.mlp.shared_expert.down_proj": {
1836
+ "group_size": 64,
1837
+ "bits": 8,
1838
+ "mode": "affine"
1839
+ },
1840
+ "model.layers.38.mlp.shared_expert.up_proj": {
1841
+ "group_size": 64,
1842
+ "bits": 8,
1843
+ "mode": "affine"
1844
+ },
1845
  "model.layers.39.self_attn.q_proj": {
1846
  "group_size": 64,
1847
  "bits": 8,
 
1877
  "bits": 4,
1878
  "mode": "mxfp4"
1879
  },
1880
+ "model.layers.39.mlp.shared_expert.gate_proj": {
1881
+ "group_size": 64,
1882
+ "bits": 8,
1883
+ "mode": "affine"
1884
+ },
1885
+ "model.layers.39.mlp.shared_expert.down_proj": {
1886
+ "group_size": 64,
1887
+ "bits": 8,
1888
+ "mode": "affine"
1889
+ },
1890
+ "model.layers.39.mlp.shared_expert.up_proj": {
1891
+ "group_size": 64,
1892
+ "bits": 8,
1893
+ "mode": "affine"
1894
+ },
1895
  "model.layers.40.linear_attn.in_proj_qkvz": {
1896
  "group_size": 64,
1897
  "bits": 8,
 
1922
  "bits": 4,
1923
  "mode": "mxfp4"
1924
  },
1925
+ "model.layers.40.mlp.shared_expert.gate_proj": {
1926
+ "group_size": 64,
1927
+ "bits": 8,
1928
+ "mode": "affine"
1929
+ },
1930
+ "model.layers.40.mlp.shared_expert.down_proj": {
1931
+ "group_size": 64,
1932
+ "bits": 8,
1933
+ "mode": "affine"
1934
+ },
1935
+ "model.layers.40.mlp.shared_expert.up_proj": {
1936
+ "group_size": 64,
1937
+ "bits": 8,
1938
+ "mode": "affine"
1939
+ },
1940
  "model.layers.41.linear_attn.in_proj_qkvz": {
1941
  "group_size": 64,
1942
  "bits": 8,
 
1967
  "bits": 4,
1968
  "mode": "mxfp4"
1969
  },
1970
+ "model.layers.41.mlp.shared_expert.gate_proj": {
1971
+ "group_size": 64,
1972
+ "bits": 8,
1973
+ "mode": "affine"
1974
+ },
1975
+ "model.layers.41.mlp.shared_expert.down_proj": {
1976
+ "group_size": 64,
1977
+ "bits": 8,
1978
+ "mode": "affine"
1979
+ },
1980
+ "model.layers.41.mlp.shared_expert.up_proj": {
1981
+ "group_size": 64,
1982
+ "bits": 8,
1983
+ "mode": "affine"
1984
+ },
1985
  "model.layers.42.linear_attn.in_proj_qkvz": {
1986
  "group_size": 64,
1987
  "bits": 8,
 
2012
  "bits": 4,
2013
  "mode": "mxfp4"
2014
  },
2015
+ "model.layers.42.mlp.shared_expert.gate_proj": {
2016
+ "group_size": 64,
2017
+ "bits": 8,
2018
+ "mode": "affine"
2019
+ },
2020
+ "model.layers.42.mlp.shared_expert.down_proj": {
2021
+ "group_size": 64,
2022
+ "bits": 8,
2023
+ "mode": "affine"
2024
+ },
2025
+ "model.layers.42.mlp.shared_expert.up_proj": {
2026
+ "group_size": 64,
2027
+ "bits": 8,
2028
+ "mode": "affine"
2029
+ },
2030
  "model.layers.43.self_attn.q_proj": {
2031
  "group_size": 64,
2032
  "bits": 8,
 
2062
  "bits": 4,
2063
  "mode": "mxfp4"
2064
  },
2065
+ "model.layers.43.mlp.shared_expert.gate_proj": {
2066
+ "group_size": 64,
2067
+ "bits": 8,
2068
+ "mode": "affine"
2069
+ },
2070
+ "model.layers.43.mlp.shared_expert.down_proj": {
2071
+ "group_size": 64,
2072
+ "bits": 8,
2073
+ "mode": "affine"
2074
+ },
2075
+ "model.layers.43.mlp.shared_expert.up_proj": {
2076
+ "group_size": 64,
2077
+ "bits": 8,
2078
+ "mode": "affine"
2079
+ },
2080
  "model.layers.44.linear_attn.in_proj_qkvz": {
2081
  "group_size": 64,
2082
  "bits": 8,
 
2107
  "bits": 4,
2108
  "mode": "mxfp4"
2109
  },
2110
+ "model.layers.44.mlp.shared_expert.gate_proj": {
2111
+ "group_size": 64,
2112
+ "bits": 8,
2113
+ "mode": "affine"
2114
+ },
2115
+ "model.layers.44.mlp.shared_expert.down_proj": {
2116
+ "group_size": 64,
2117
+ "bits": 8,
2118
+ "mode": "affine"
2119
+ },
2120
+ "model.layers.44.mlp.shared_expert.up_proj": {
2121
+ "group_size": 64,
2122
+ "bits": 8,
2123
+ "mode": "affine"
2124
+ },
2125
  "model.layers.45.linear_attn.in_proj_qkvz": {
2126
  "group_size": 64,
2127
  "bits": 8,
 
2152
  "bits": 4,
2153
  "mode": "mxfp4"
2154
  },
2155
+ "model.layers.45.mlp.shared_expert.gate_proj": {
2156
+ "group_size": 64,
2157
+ "bits": 8,
2158
+ "mode": "affine"
2159
+ },
2160
+ "model.layers.45.mlp.shared_expert.down_proj": {
2161
+ "group_size": 64,
2162
+ "bits": 8,
2163
+ "mode": "affine"
2164
+ },
2165
+ "model.layers.45.mlp.shared_expert.up_proj": {
2166
+ "group_size": 64,
2167
+ "bits": 8,
2168
+ "mode": "affine"
2169
+ },
2170
  "model.layers.46.linear_attn.in_proj_qkvz": {
2171
  "group_size": 64,
2172
  "bits": 8,
 
2197
  "bits": 4,
2198
  "mode": "mxfp4"
2199
  },
2200
+ "model.layers.46.mlp.shared_expert.gate_proj": {
2201
+ "group_size": 64,
2202
+ "bits": 8,
2203
+ "mode": "affine"
2204
+ },
2205
+ "model.layers.46.mlp.shared_expert.down_proj": {
2206
+ "group_size": 64,
2207
+ "bits": 8,
2208
+ "mode": "affine"
2209
+ },
2210
+ "model.layers.46.mlp.shared_expert.up_proj": {
2211
+ "group_size": 64,
2212
+ "bits": 8,
2213
+ "mode": "affine"
2214
+ },
2215
  "model.layers.47.self_attn.q_proj": {
2216
  "group_size": 64,
2217
  "bits": 8,
 
2247
  "bits": 4,
2248
  "mode": "mxfp4"
2249
  },
2250
+ "model.layers.47.mlp.shared_expert.gate_proj": {
2251
+ "group_size": 64,
2252
+ "bits": 8,
2253
+ "mode": "affine"
2254
+ },
2255
+ "model.layers.47.mlp.shared_expert.down_proj": {
2256
+ "group_size": 64,
2257
+ "bits": 8,
2258
+ "mode": "affine"
2259
+ },
2260
+ "model.layers.47.mlp.shared_expert.up_proj": {
2261
+ "group_size": 64,
2262
+ "bits": 8,
2263
+ "mode": "affine"
2264
+ },
2265
  "lm_head": {
2266
  "group_size": 64,
2267
  "bits": 8,
 
2269
  }
2270
  },
2271
  "quantization_config": {
2272
+ "group_size": 32,
2273
  "bits": 4,
2274
+ "mode": "mxfp4",
2275
  "model.embed_tokens": {
2276
  "group_size": 64,
2277
  "bits": 8,
 
2307
  "bits": 4,
2308
  "mode": "mxfp4"
2309
  },
2310
+ "model.layers.0.mlp.shared_expert.gate_proj": {
2311
+ "group_size": 64,
2312
+ "bits": 8,
2313
+ "mode": "affine"
2314
+ },
2315
+ "model.layers.0.mlp.shared_expert.down_proj": {
2316
+ "group_size": 64,
2317
+ "bits": 8,
2318
+ "mode": "affine"
2319
+ },
2320
+ "model.layers.0.mlp.shared_expert.up_proj": {
2321
+ "group_size": 64,
2322
+ "bits": 8,
2323
+ "mode": "affine"
2324
+ },
2325
  "model.layers.1.linear_attn.in_proj_qkvz": {
2326
  "group_size": 64,
2327
  "bits": 8,
 
2352
  "bits": 4,
2353
  "mode": "mxfp4"
2354
  },
2355
+ "model.layers.1.mlp.shared_expert.gate_proj": {
2356
+ "group_size": 64,
2357
+ "bits": 8,
2358
+ "mode": "affine"
2359
+ },
2360
+ "model.layers.1.mlp.shared_expert.down_proj": {
2361
+ "group_size": 64,
2362
+ "bits": 8,
2363
+ "mode": "affine"
2364
+ },
2365
+ "model.layers.1.mlp.shared_expert.up_proj": {
2366
+ "group_size": 64,
2367
+ "bits": 8,
2368
+ "mode": "affine"
2369
+ },
2370
  "model.layers.2.linear_attn.in_proj_qkvz": {
2371
  "group_size": 64,
2372
  "bits": 8,
 
2397
  "bits": 4,
2398
  "mode": "mxfp4"
2399
  },
2400
+ "model.layers.2.mlp.shared_expert.gate_proj": {
2401
+ "group_size": 64,
2402
+ "bits": 8,
2403
+ "mode": "affine"
2404
+ },
2405
+ "model.layers.2.mlp.shared_expert.down_proj": {
2406
+ "group_size": 64,
2407
+ "bits": 8,
2408
+ "mode": "affine"
2409
+ },
2410
+ "model.layers.2.mlp.shared_expert.up_proj": {
2411
+ "group_size": 64,
2412
+ "bits": 8,
2413
+ "mode": "affine"
2414
+ },
2415
  "model.layers.3.self_attn.q_proj": {
2416
  "group_size": 64,
2417
  "bits": 8,
 
2447
  "bits": 4,
2448
  "mode": "mxfp4"
2449
  },
2450
+ "model.layers.3.mlp.shared_expert.gate_proj": {
2451
+ "group_size": 64,
2452
+ "bits": 8,
2453
+ "mode": "affine"
2454
+ },
2455
+ "model.layers.3.mlp.shared_expert.down_proj": {
2456
+ "group_size": 64,
2457
+ "bits": 8,
2458
+ "mode": "affine"
2459
+ },
2460
+ "model.layers.3.mlp.shared_expert.up_proj": {
2461
+ "group_size": 64,
2462
+ "bits": 8,
2463
+ "mode": "affine"
2464
+ },
2465
  "model.layers.4.linear_attn.in_proj_qkvz": {
2466
  "group_size": 64,
2467
  "bits": 8,
 
2492
  "bits": 4,
2493
  "mode": "mxfp4"
2494
  },
2495
+ "model.layers.4.mlp.shared_expert.gate_proj": {
2496
+ "group_size": 64,
2497
+ "bits": 8,
2498
+ "mode": "affine"
2499
+ },
2500
+ "model.layers.4.mlp.shared_expert.down_proj": {
2501
+ "group_size": 64,
2502
+ "bits": 8,
2503
+ "mode": "affine"
2504
+ },
2505
+ "model.layers.4.mlp.shared_expert.up_proj": {
2506
+ "group_size": 64,
2507
+ "bits": 8,
2508
+ "mode": "affine"
2509
+ },
2510
  "model.layers.5.linear_attn.in_proj_qkvz": {
2511
  "group_size": 64,
2512
  "bits": 8,
 
2532
  "bits": 4,
2533
  "mode": "mxfp4"
2534
  },
2535
+ "model.layers.5.mlp.switch_mlp.down_proj": {
2536
+ "group_size": 32,
2537
+ "bits": 4,
2538
+ "mode": "mxfp4"
2539
+ },
2540
+ "model.layers.5.mlp.shared_expert.gate_proj": {
2541
+ "group_size": 64,
2542
+ "bits": 8,
2543
+ "mode": "affine"
2544
+ },
2545
+ "model.layers.5.mlp.shared_expert.down_proj": {
2546
+ "group_size": 64,
2547
+ "bits": 8,
2548
+ "mode": "affine"
2549
+ },
2550
+ "model.layers.5.mlp.shared_expert.up_proj": {
2551
+ "group_size": 64,
2552
+ "bits": 8,
2553
+ "mode": "affine"
2554
  },
2555
  "model.layers.6.linear_attn.in_proj_qkvz": {
2556
  "group_size": 64,
 
2582
  "bits": 4,
2583
  "mode": "mxfp4"
2584
  },
2585
+ "model.layers.6.mlp.shared_expert.gate_proj": {
2586
+ "group_size": 64,
2587
+ "bits": 8,
2588
+ "mode": "affine"
2589
+ },
2590
+ "model.layers.6.mlp.shared_expert.down_proj": {
2591
+ "group_size": 64,
2592
+ "bits": 8,
2593
+ "mode": "affine"
2594
+ },
2595
+ "model.layers.6.mlp.shared_expert.up_proj": {
2596
+ "group_size": 64,
2597
+ "bits": 8,
2598
+ "mode": "affine"
2599
+ },
2600
  "model.layers.7.self_attn.q_proj": {
2601
  "group_size": 64,
2602
  "bits": 8,
 
2632
  "bits": 4,
2633
  "mode": "mxfp4"
2634
  },
2635
+ "model.layers.7.mlp.shared_expert.gate_proj": {
2636
+ "group_size": 64,
2637
+ "bits": 8,
2638
+ "mode": "affine"
2639
+ },
2640
+ "model.layers.7.mlp.shared_expert.down_proj": {
2641
+ "group_size": 64,
2642
+ "bits": 8,
2643
+ "mode": "affine"
2644
+ },
2645
+ "model.layers.7.mlp.shared_expert.up_proj": {
2646
+ "group_size": 64,
2647
+ "bits": 8,
2648
+ "mode": "affine"
2649
+ },
2650
  "model.layers.8.linear_attn.in_proj_qkvz": {
2651
  "group_size": 64,
2652
  "bits": 8,
 
2677
  "bits": 4,
2678
  "mode": "mxfp4"
2679
  },
2680
+ "model.layers.8.mlp.shared_expert.gate_proj": {
2681
+ "group_size": 64,
2682
+ "bits": 8,
2683
+ "mode": "affine"
2684
+ },
2685
+ "model.layers.8.mlp.shared_expert.down_proj": {
2686
+ "group_size": 64,
2687
+ "bits": 8,
2688
+ "mode": "affine"
2689
+ },
2690
+ "model.layers.8.mlp.shared_expert.up_proj": {
2691
+ "group_size": 64,
2692
+ "bits": 8,
2693
+ "mode": "affine"
2694
+ },
2695
  "model.layers.9.linear_attn.in_proj_qkvz": {
2696
  "group_size": 64,
2697
  "bits": 8,
 
2722
  "bits": 4,
2723
  "mode": "mxfp4"
2724
  },
2725
+ "model.layers.9.mlp.shared_expert.gate_proj": {
2726
+ "group_size": 64,
2727
+ "bits": 8,
2728
+ "mode": "affine"
2729
+ },
2730
+ "model.layers.9.mlp.shared_expert.down_proj": {
2731
+ "group_size": 64,
2732
+ "bits": 8,
2733
+ "mode": "affine"
2734
+ },
2735
+ "model.layers.9.mlp.shared_expert.up_proj": {
2736
+ "group_size": 64,
2737
+ "bits": 8,
2738
+ "mode": "affine"
2739
+ },
2740
  "model.layers.10.linear_attn.in_proj_qkvz": {
2741
  "group_size": 64,
2742
  "bits": 8,
 
2767
  "bits": 4,
2768
  "mode": "mxfp4"
2769
  },
2770
+ "model.layers.10.mlp.shared_expert.gate_proj": {
2771
+ "group_size": 64,
2772
+ "bits": 8,
2773
+ "mode": "affine"
2774
+ },
2775
+ "model.layers.10.mlp.shared_expert.down_proj": {
2776
+ "group_size": 64,
2777
+ "bits": 8,
2778
+ "mode": "affine"
2779
+ },
2780
+ "model.layers.10.mlp.shared_expert.up_proj": {
2781
+ "group_size": 64,
2782
+ "bits": 8,
2783
+ "mode": "affine"
2784
+ },
2785
  "model.layers.11.self_attn.q_proj": {
2786
  "group_size": 64,
2787
  "bits": 8,
 
2817
  "bits": 4,
2818
  "mode": "mxfp4"
2819
  },
2820
+ "model.layers.11.mlp.shared_expert.gate_proj": {
2821
+ "group_size": 64,
2822
+ "bits": 8,
2823
+ "mode": "affine"
2824
+ },
2825
+ "model.layers.11.mlp.shared_expert.down_proj": {
2826
+ "group_size": 64,
2827
+ "bits": 8,
2828
+ "mode": "affine"
2829
+ },
2830
+ "model.layers.11.mlp.shared_expert.up_proj": {
2831
+ "group_size": 64,
2832
+ "bits": 8,
2833
+ "mode": "affine"
2834
+ },
2835
  "model.layers.12.linear_attn.in_proj_qkvz": {
2836
  "group_size": 64,
2837
  "bits": 8,
 
2862
  "bits": 4,
2863
  "mode": "mxfp4"
2864
  },
2865
+ "model.layers.12.mlp.shared_expert.gate_proj": {
2866
+ "group_size": 64,
2867
+ "bits": 8,
2868
+ "mode": "affine"
2869
+ },
2870
+ "model.layers.12.mlp.shared_expert.down_proj": {
2871
+ "group_size": 64,
2872
+ "bits": 8,
2873
+ "mode": "affine"
2874
+ },
2875
+ "model.layers.12.mlp.shared_expert.up_proj": {
2876
+ "group_size": 64,
2877
+ "bits": 8,
2878
+ "mode": "affine"
2879
+ },
2880
  "model.layers.13.linear_attn.in_proj_qkvz": {
2881
  "group_size": 64,
2882
  "bits": 8,
 
2907
  "bits": 4,
2908
  "mode": "mxfp4"
2909
  },
2910
+ "model.layers.13.mlp.shared_expert.gate_proj": {
2911
+ "group_size": 64,
2912
+ "bits": 8,
2913
+ "mode": "affine"
2914
+ },
2915
+ "model.layers.13.mlp.shared_expert.down_proj": {
2916
+ "group_size": 64,
2917
+ "bits": 8,
2918
+ "mode": "affine"
2919
+ },
2920
+ "model.layers.13.mlp.shared_expert.up_proj": {
2921
+ "group_size": 64,
2922
+ "bits": 8,
2923
+ "mode": "affine"
2924
+ },
2925
  "model.layers.14.linear_attn.in_proj_qkvz": {
2926
  "group_size": 64,
2927
  "bits": 8,
 
2952
  "bits": 4,
2953
  "mode": "mxfp4"
2954
  },
2955
+ "model.layers.14.mlp.shared_expert.gate_proj": {
2956
+ "group_size": 64,
2957
+ "bits": 8,
2958
+ "mode": "affine"
2959
+ },
2960
+ "model.layers.14.mlp.shared_expert.down_proj": {
2961
+ "group_size": 64,
2962
+ "bits": 8,
2963
+ "mode": "affine"
2964
+ },
2965
+ "model.layers.14.mlp.shared_expert.up_proj": {
2966
+ "group_size": 64,
2967
+ "bits": 8,
2968
+ "mode": "affine"
2969
+ },
2970
  "model.layers.15.self_attn.q_proj": {
2971
  "group_size": 64,
2972
  "bits": 8,
 
3002
  "bits": 4,
3003
  "mode": "mxfp4"
3004
  },
3005
+ "model.layers.15.mlp.shared_expert.gate_proj": {
3006
+ "group_size": 64,
3007
+ "bits": 8,
3008
+ "mode": "affine"
3009
+ },
3010
+ "model.layers.15.mlp.shared_expert.down_proj": {
3011
+ "group_size": 64,
3012
+ "bits": 8,
3013
+ "mode": "affine"
3014
+ },
3015
+ "model.layers.15.mlp.shared_expert.up_proj": {
3016
+ "group_size": 64,
3017
+ "bits": 8,
3018
+ "mode": "affine"
3019
+ },
3020
  "model.layers.16.linear_attn.in_proj_qkvz": {
3021
  "group_size": 64,
3022
  "bits": 8,
 
3047
  "bits": 4,
3048
  "mode": "mxfp4"
3049
  },
3050
+ "model.layers.16.mlp.shared_expert.gate_proj": {
3051
+ "group_size": 64,
3052
+ "bits": 8,
3053
+ "mode": "affine"
3054
+ },
3055
+ "model.layers.16.mlp.shared_expert.down_proj": {
3056
+ "group_size": 64,
3057
+ "bits": 8,
3058
+ "mode": "affine"
3059
+ },
3060
+ "model.layers.16.mlp.shared_expert.up_proj": {
3061
+ "group_size": 64,
3062
+ "bits": 8,
3063
+ "mode": "affine"
3064
+ },
3065
  "model.layers.17.linear_attn.in_proj_qkvz": {
3066
  "group_size": 64,
3067
  "bits": 8,
 
3092
  "bits": 4,
3093
  "mode": "mxfp4"
3094
  },
3095
+ "model.layers.17.mlp.shared_expert.gate_proj": {
3096
+ "group_size": 64,
3097
+ "bits": 8,
3098
+ "mode": "affine"
3099
+ },
3100
+ "model.layers.17.mlp.shared_expert.down_proj": {
3101
+ "group_size": 64,
3102
+ "bits": 8,
3103
+ "mode": "affine"
3104
+ },
3105
+ "model.layers.17.mlp.shared_expert.up_proj": {
3106
+ "group_size": 64,
3107
+ "bits": 8,
3108
+ "mode": "affine"
3109
+ },
3110
  "model.layers.18.linear_attn.in_proj_qkvz": {
3111
  "group_size": 64,
3112
  "bits": 8,
 
3137
  "bits": 4,
3138
  "mode": "mxfp4"
3139
  },
3140
+ "model.layers.18.mlp.shared_expert.gate_proj": {
3141
+ "group_size": 64,
3142
+ "bits": 8,
3143
+ "mode": "affine"
3144
+ },
3145
+ "model.layers.18.mlp.shared_expert.down_proj": {
3146
+ "group_size": 64,
3147
+ "bits": 8,
3148
+ "mode": "affine"
3149
+ },
3150
+ "model.layers.18.mlp.shared_expert.up_proj": {
3151
+ "group_size": 64,
3152
+ "bits": 8,
3153
+ "mode": "affine"
3154
+ },
3155
  "model.layers.19.self_attn.q_proj": {
3156
  "group_size": 64,
3157
  "bits": 8,
 
3187
  "bits": 4,
3188
  "mode": "mxfp4"
3189
  },
3190
+ "model.layers.19.mlp.shared_expert.gate_proj": {
3191
+ "group_size": 64,
3192
+ "bits": 8,
3193
+ "mode": "affine"
3194
+ },
3195
+ "model.layers.19.mlp.shared_expert.down_proj": {
3196
+ "group_size": 64,
3197
+ "bits": 8,
3198
+ "mode": "affine"
3199
+ },
3200
+ "model.layers.19.mlp.shared_expert.up_proj": {
3201
+ "group_size": 64,
3202
+ "bits": 8,
3203
+ "mode": "affine"
3204
+ },
3205
  "model.layers.20.linear_attn.in_proj_qkvz": {
3206
  "group_size": 64,
3207
  "bits": 8,
 
3232
  "bits": 4,
3233
  "mode": "mxfp4"
3234
  },
3235
+ "model.layers.20.mlp.shared_expert.gate_proj": {
3236
+ "group_size": 64,
3237
+ "bits": 8,
3238
+ "mode": "affine"
3239
+ },
3240
+ "model.layers.20.mlp.shared_expert.down_proj": {
3241
+ "group_size": 64,
3242
+ "bits": 8,
3243
+ "mode": "affine"
3244
+ },
3245
+ "model.layers.20.mlp.shared_expert.up_proj": {
3246
+ "group_size": 64,
3247
+ "bits": 8,
3248
+ "mode": "affine"
3249
+ },
3250
  "model.layers.21.linear_attn.in_proj_qkvz": {
3251
  "group_size": 64,
3252
  "bits": 8,
 
3277
  "bits": 4,
3278
  "mode": "mxfp4"
3279
  },
3280
+ "model.layers.21.mlp.shared_expert.gate_proj": {
3281
+ "group_size": 64,
3282
+ "bits": 8,
3283
+ "mode": "affine"
3284
+ },
3285
+ "model.layers.21.mlp.shared_expert.down_proj": {
3286
+ "group_size": 64,
3287
+ "bits": 8,
3288
+ "mode": "affine"
3289
+ },
3290
+ "model.layers.21.mlp.shared_expert.up_proj": {
3291
+ "group_size": 64,
3292
+ "bits": 8,
3293
+ "mode": "affine"
3294
+ },
3295
  "model.layers.22.linear_attn.in_proj_qkvz": {
3296
  "group_size": 64,
3297
  "bits": 8,
 
3322
  "bits": 4,
3323
  "mode": "mxfp4"
3324
  },
3325
+ "model.layers.22.mlp.shared_expert.gate_proj": {
3326
+ "group_size": 64,
3327
+ "bits": 8,
3328
+ "mode": "affine"
3329
+ },
3330
+ "model.layers.22.mlp.shared_expert.down_proj": {
3331
+ "group_size": 64,
3332
+ "bits": 8,
3333
+ "mode": "affine"
3334
+ },
3335
+ "model.layers.22.mlp.shared_expert.up_proj": {
3336
+ "group_size": 64,
3337
+ "bits": 8,
3338
+ "mode": "affine"
3339
+ },
3340
  "model.layers.23.self_attn.q_proj": {
3341
  "group_size": 64,
3342
  "bits": 8,
 
3372
  "bits": 4,
3373
  "mode": "mxfp4"
3374
  },
3375
+ "model.layers.23.mlp.shared_expert.gate_proj": {
3376
+ "group_size": 64,
3377
+ "bits": 8,
3378
+ "mode": "affine"
3379
+ },
3380
+ "model.layers.23.mlp.shared_expert.down_proj": {
3381
+ "group_size": 64,
3382
+ "bits": 8,
3383
+ "mode": "affine"
3384
+ },
3385
+ "model.layers.23.mlp.shared_expert.up_proj": {
3386
+ "group_size": 64,
3387
+ "bits": 8,
3388
+ "mode": "affine"
3389
+ },
3390
  "model.layers.24.linear_attn.in_proj_qkvz": {
3391
  "group_size": 64,
3392
  "bits": 8,
 
3417
  "bits": 4,
3418
  "mode": "mxfp4"
3419
  },
3420
+ "model.layers.24.mlp.shared_expert.gate_proj": {
3421
+ "group_size": 64,
3422
+ "bits": 8,
3423
+ "mode": "affine"
3424
+ },
3425
+ "model.layers.24.mlp.shared_expert.down_proj": {
3426
+ "group_size": 64,
3427
+ "bits": 8,
3428
+ "mode": "affine"
3429
+ },
3430
+ "model.layers.24.mlp.shared_expert.up_proj": {
3431
+ "group_size": 64,
3432
+ "bits": 8,
3433
+ "mode": "affine"
3434
+ },
3435
  "model.layers.25.linear_attn.in_proj_qkvz": {
3436
  "group_size": 64,
3437
  "bits": 8,
 
3462
  "bits": 4,
3463
  "mode": "mxfp4"
3464
  },
3465
+ "model.layers.25.mlp.shared_expert.gate_proj": {
3466
+ "group_size": 64,
3467
+ "bits": 8,
3468
+ "mode": "affine"
3469
+ },
3470
+ "model.layers.25.mlp.shared_expert.down_proj": {
3471
+ "group_size": 64,
3472
+ "bits": 8,
3473
+ "mode": "affine"
3474
+ },
3475
+ "model.layers.25.mlp.shared_expert.up_proj": {
3476
+ "group_size": 64,
3477
+ "bits": 8,
3478
+ "mode": "affine"
3479
+ },
3480
  "model.layers.26.linear_attn.in_proj_qkvz": {
3481
  "group_size": 64,
3482
  "bits": 8,
 
3507
  "bits": 4,
3508
  "mode": "mxfp4"
3509
  },
3510
+ "model.layers.26.mlp.shared_expert.gate_proj": {
3511
+ "group_size": 64,
3512
+ "bits": 8,
3513
+ "mode": "affine"
3514
+ },
3515
+ "model.layers.26.mlp.shared_expert.down_proj": {
3516
+ "group_size": 64,
3517
+ "bits": 8,
3518
+ "mode": "affine"
3519
+ },
3520
+ "model.layers.26.mlp.shared_expert.up_proj": {
3521
+ "group_size": 64,
3522
+ "bits": 8,
3523
+ "mode": "affine"
3524
+ },
3525
  "model.layers.27.self_attn.q_proj": {
3526
  "group_size": 64,
3527
  "bits": 8,
 
3557
  "bits": 4,
3558
  "mode": "mxfp4"
3559
  },
3560
+ "model.layers.27.mlp.shared_expert.gate_proj": {
3561
+ "group_size": 64,
3562
+ "bits": 8,
3563
+ "mode": "affine"
3564
+ },
3565
+ "model.layers.27.mlp.shared_expert.down_proj": {
3566
+ "group_size": 64,
3567
+ "bits": 8,
3568
+ "mode": "affine"
3569
+ },
3570
+ "model.layers.27.mlp.shared_expert.up_proj": {
3571
+ "group_size": 64,
3572
+ "bits": 8,
3573
+ "mode": "affine"
3574
+ },
3575
  "model.layers.28.linear_attn.in_proj_qkvz": {
3576
  "group_size": 64,
3577
  "bits": 8,
 
3602
  "bits": 4,
3603
  "mode": "mxfp4"
3604
  },
3605
+ "model.layers.28.mlp.shared_expert.gate_proj": {
3606
+ "group_size": 64,
3607
+ "bits": 8,
3608
+ "mode": "affine"
3609
+ },
3610
+ "model.layers.28.mlp.shared_expert.down_proj": {
3611
+ "group_size": 64,
3612
+ "bits": 8,
3613
+ "mode": "affine"
3614
+ },
3615
+ "model.layers.28.mlp.shared_expert.up_proj": {
3616
+ "group_size": 64,
3617
+ "bits": 8,
3618
+ "mode": "affine"
3619
+ },
3620
  "model.layers.29.linear_attn.in_proj_qkvz": {
3621
  "group_size": 64,
3622
  "bits": 8,
 
3647
  "bits": 4,
3648
  "mode": "mxfp4"
3649
  },
3650
+ "model.layers.29.mlp.shared_expert.gate_proj": {
3651
+ "group_size": 64,
3652
+ "bits": 8,
3653
+ "mode": "affine"
3654
+ },
3655
+ "model.layers.29.mlp.shared_expert.down_proj": {
3656
+ "group_size": 64,
3657
+ "bits": 8,
3658
+ "mode": "affine"
3659
+ },
3660
+ "model.layers.29.mlp.shared_expert.up_proj": {
3661
+ "group_size": 64,
3662
+ "bits": 8,
3663
+ "mode": "affine"
3664
+ },
3665
  "model.layers.30.linear_attn.in_proj_qkvz": {
3666
  "group_size": 64,
3667
  "bits": 8,
 
3692
  "bits": 4,
3693
  "mode": "mxfp4"
3694
  },
3695
+ "model.layers.30.mlp.shared_expert.gate_proj": {
3696
+ "group_size": 64,
3697
+ "bits": 8,
3698
+ "mode": "affine"
3699
+ },
3700
+ "model.layers.30.mlp.shared_expert.down_proj": {
3701
+ "group_size": 64,
3702
+ "bits": 8,
3703
+ "mode": "affine"
3704
+ },
3705
+ "model.layers.30.mlp.shared_expert.up_proj": {
3706
+ "group_size": 64,
3707
+ "bits": 8,
3708
+ "mode": "affine"
3709
+ },
3710
  "model.layers.31.self_attn.q_proj": {
3711
  "group_size": 64,
3712
  "bits": 8,
 
3742
  "bits": 4,
3743
  "mode": "mxfp4"
3744
  },
3745
+ "model.layers.31.mlp.shared_expert.gate_proj": {
3746
+ "group_size": 64,
3747
+ "bits": 8,
3748
+ "mode": "affine"
3749
+ },
3750
+ "model.layers.31.mlp.shared_expert.down_proj": {
3751
+ "group_size": 64,
3752
+ "bits": 8,
3753
+ "mode": "affine"
3754
+ },
3755
+ "model.layers.31.mlp.shared_expert.up_proj": {
3756
+ "group_size": 64,
3757
+ "bits": 8,
3758
+ "mode": "affine"
3759
+ },
3760
  "model.layers.32.linear_attn.in_proj_qkvz": {
3761
  "group_size": 64,
3762
  "bits": 8,
 
3787
  "bits": 4,
3788
  "mode": "mxfp4"
3789
  },
3790
+ "model.layers.32.mlp.shared_expert.gate_proj": {
3791
+ "group_size": 64,
3792
+ "bits": 8,
3793
+ "mode": "affine"
3794
+ },
3795
+ "model.layers.32.mlp.shared_expert.down_proj": {
3796
+ "group_size": 64,
3797
+ "bits": 8,
3798
+ "mode": "affine"
3799
+ },
3800
+ "model.layers.32.mlp.shared_expert.up_proj": {
3801
+ "group_size": 64,
3802
+ "bits": 8,
3803
+ "mode": "affine"
3804
+ },
3805
  "model.layers.33.linear_attn.in_proj_qkvz": {
3806
  "group_size": 64,
3807
  "bits": 8,
 
3832
  "bits": 4,
3833
  "mode": "mxfp4"
3834
  },
3835
+ "model.layers.33.mlp.shared_expert.gate_proj": {
3836
+ "group_size": 64,
3837
+ "bits": 8,
3838
+ "mode": "affine"
3839
+ },
3840
+ "model.layers.33.mlp.shared_expert.down_proj": {
3841
+ "group_size": 64,
3842
+ "bits": 8,
3843
+ "mode": "affine"
3844
+ },
3845
+ "model.layers.33.mlp.shared_expert.up_proj": {
3846
+ "group_size": 64,
3847
+ "bits": 8,
3848
+ "mode": "affine"
3849
+ },
3850
  "model.layers.34.linear_attn.in_proj_qkvz": {
3851
  "group_size": 64,
3852
  "bits": 8,
 
3877
  "bits": 4,
3878
  "mode": "mxfp4"
3879
  },
3880
+ "model.layers.34.mlp.shared_expert.gate_proj": {
3881
+ "group_size": 64,
3882
+ "bits": 8,
3883
+ "mode": "affine"
3884
+ },
3885
+ "model.layers.34.mlp.shared_expert.down_proj": {
3886
+ "group_size": 64,
3887
+ "bits": 8,
3888
+ "mode": "affine"
3889
+ },
3890
+ "model.layers.34.mlp.shared_expert.up_proj": {
3891
+ "group_size": 64,
3892
+ "bits": 8,
3893
+ "mode": "affine"
3894
+ },
3895
  "model.layers.35.self_attn.q_proj": {
3896
  "group_size": 64,
3897
  "bits": 8,
 
3927
  "bits": 4,
3928
  "mode": "mxfp4"
3929
  },
3930
+ "model.layers.35.mlp.shared_expert.gate_proj": {
3931
+ "group_size": 64,
3932
+ "bits": 8,
3933
+ "mode": "affine"
3934
+ },
3935
+ "model.layers.35.mlp.shared_expert.down_proj": {
3936
+ "group_size": 64,
3937
+ "bits": 8,
3938
+ "mode": "affine"
3939
+ },
3940
+ "model.layers.35.mlp.shared_expert.up_proj": {
3941
+ "group_size": 64,
3942
+ "bits": 8,
3943
+ "mode": "affine"
3944
+ },
3945
  "model.layers.36.linear_attn.in_proj_qkvz": {
3946
  "group_size": 64,
3947
  "bits": 8,
 
3972
  "bits": 4,
3973
  "mode": "mxfp4"
3974
  },
3975
+ "model.layers.36.mlp.shared_expert.gate_proj": {
3976
+ "group_size": 64,
3977
+ "bits": 8,
3978
+ "mode": "affine"
3979
+ },
3980
+ "model.layers.36.mlp.shared_expert.down_proj": {
3981
+ "group_size": 64,
3982
+ "bits": 8,
3983
+ "mode": "affine"
3984
+ },
3985
+ "model.layers.36.mlp.shared_expert.up_proj": {
3986
+ "group_size": 64,
3987
+ "bits": 8,
3988
+ "mode": "affine"
3989
+ },
3990
  "model.layers.37.linear_attn.in_proj_qkvz": {
3991
  "group_size": 64,
3992
  "bits": 8,
 
4017
  "bits": 4,
4018
  "mode": "mxfp4"
4019
  },
4020
+ "model.layers.37.mlp.shared_expert.gate_proj": {
4021
+ "group_size": 64,
4022
+ "bits": 8,
4023
+ "mode": "affine"
4024
+ },
4025
+ "model.layers.37.mlp.shared_expert.down_proj": {
4026
+ "group_size": 64,
4027
+ "bits": 8,
4028
+ "mode": "affine"
4029
+ },
4030
+ "model.layers.37.mlp.shared_expert.up_proj": {
4031
+ "group_size": 64,
4032
+ "bits": 8,
4033
+ "mode": "affine"
4034
+ },
4035
  "model.layers.38.linear_attn.in_proj_qkvz": {
4036
  "group_size": 64,
4037
  "bits": 8,
 
4062
  "bits": 4,
4063
  "mode": "mxfp4"
4064
  },
4065
+ "model.layers.38.mlp.shared_expert.gate_proj": {
4066
+ "group_size": 64,
4067
+ "bits": 8,
4068
+ "mode": "affine"
4069
+ },
4070
+ "model.layers.38.mlp.shared_expert.down_proj": {
4071
+ "group_size": 64,
4072
+ "bits": 8,
4073
+ "mode": "affine"
4074
+ },
4075
+ "model.layers.38.mlp.shared_expert.up_proj": {
4076
+ "group_size": 64,
4077
+ "bits": 8,
4078
+ "mode": "affine"
4079
+ },
4080
  "model.layers.39.self_attn.q_proj": {
4081
  "group_size": 64,
4082
  "bits": 8,
 
4112
  "bits": 4,
4113
  "mode": "mxfp4"
4114
  },
4115
+ "model.layers.39.mlp.shared_expert.gate_proj": {
4116
+ "group_size": 64,
4117
+ "bits": 8,
4118
+ "mode": "affine"
4119
+ },
4120
+ "model.layers.39.mlp.shared_expert.down_proj": {
4121
+ "group_size": 64,
4122
+ "bits": 8,
4123
+ "mode": "affine"
4124
+ },
4125
+ "model.layers.39.mlp.shared_expert.up_proj": {
4126
+ "group_size": 64,
4127
+ "bits": 8,
4128
+ "mode": "affine"
4129
+ },
4130
  "model.layers.40.linear_attn.in_proj_qkvz": {
4131
  "group_size": 64,
4132
  "bits": 8,
 
4157
  "bits": 4,
4158
  "mode": "mxfp4"
4159
  },
4160
+ "model.layers.40.mlp.shared_expert.gate_proj": {
4161
+ "group_size": 64,
4162
+ "bits": 8,
4163
+ "mode": "affine"
4164
+ },
4165
+ "model.layers.40.mlp.shared_expert.down_proj": {
4166
+ "group_size": 64,
4167
+ "bits": 8,
4168
+ "mode": "affine"
4169
+ },
4170
+ "model.layers.40.mlp.shared_expert.up_proj": {
4171
+ "group_size": 64,
4172
+ "bits": 8,
4173
+ "mode": "affine"
4174
+ },
4175
  "model.layers.41.linear_attn.in_proj_qkvz": {
4176
  "group_size": 64,
4177
  "bits": 8,
 
4202
  "bits": 4,
4203
  "mode": "mxfp4"
4204
  },
4205
+ "model.layers.41.mlp.shared_expert.gate_proj": {
4206
+ "group_size": 64,
4207
+ "bits": 8,
4208
+ "mode": "affine"
4209
+ },
4210
+ "model.layers.41.mlp.shared_expert.down_proj": {
4211
+ "group_size": 64,
4212
+ "bits": 8,
4213
+ "mode": "affine"
4214
+ },
4215
+ "model.layers.41.mlp.shared_expert.up_proj": {
4216
+ "group_size": 64,
4217
+ "bits": 8,
4218
+ "mode": "affine"
4219
+ },
4220
  "model.layers.42.linear_attn.in_proj_qkvz": {
4221
  "group_size": 64,
4222
  "bits": 8,
 
4247
  "bits": 4,
4248
  "mode": "mxfp4"
4249
  },
4250
+ "model.layers.42.mlp.shared_expert.gate_proj": {
4251
+ "group_size": 64,
4252
+ "bits": 8,
4253
+ "mode": "affine"
4254
+ },
4255
+ "model.layers.42.mlp.shared_expert.down_proj": {
4256
+ "group_size": 64,
4257
+ "bits": 8,
4258
+ "mode": "affine"
4259
+ },
4260
+ "model.layers.42.mlp.shared_expert.up_proj": {
4261
+ "group_size": 64,
4262
+ "bits": 8,
4263
+ "mode": "affine"
4264
+ },
4265
  "model.layers.43.self_attn.q_proj": {
4266
  "group_size": 64,
4267
  "bits": 8,
 
4297
  "bits": 4,
4298
  "mode": "mxfp4"
4299
  },
4300
+ "model.layers.43.mlp.shared_expert.gate_proj": {
4301
+ "group_size": 64,
4302
+ "bits": 8,
4303
+ "mode": "affine"
4304
+ },
4305
+ "model.layers.43.mlp.shared_expert.down_proj": {
4306
+ "group_size": 64,
4307
+ "bits": 8,
4308
+ "mode": "affine"
4309
+ },
4310
+ "model.layers.43.mlp.shared_expert.up_proj": {
4311
+ "group_size": 64,
4312
+ "bits": 8,
4313
+ "mode": "affine"
4314
+ },
4315
  "model.layers.44.linear_attn.in_proj_qkvz": {
4316
  "group_size": 64,
4317
  "bits": 8,
 
4342
  "bits": 4,
4343
  "mode": "mxfp4"
4344
  },
4345
+ "model.layers.44.mlp.shared_expert.gate_proj": {
4346
+ "group_size": 64,
4347
+ "bits": 8,
4348
+ "mode": "affine"
4349
+ },
4350
+ "model.layers.44.mlp.shared_expert.down_proj": {
4351
+ "group_size": 64,
4352
+ "bits": 8,
4353
+ "mode": "affine"
4354
+ },
4355
+ "model.layers.44.mlp.shared_expert.up_proj": {
4356
+ "group_size": 64,
4357
+ "bits": 8,
4358
+ "mode": "affine"
4359
+ },
4360
  "model.layers.45.linear_attn.in_proj_qkvz": {
4361
  "group_size": 64,
4362
  "bits": 8,
 
4387
  "bits": 4,
4388
  "mode": "mxfp4"
4389
  },
4390
+ "model.layers.45.mlp.shared_expert.gate_proj": {
4391
+ "group_size": 64,
4392
+ "bits": 8,
4393
+ "mode": "affine"
4394
+ },
4395
+ "model.layers.45.mlp.shared_expert.down_proj": {
4396
+ "group_size": 64,
4397
+ "bits": 8,
4398
+ "mode": "affine"
4399
+ },
4400
+ "model.layers.45.mlp.shared_expert.up_proj": {
4401
+ "group_size": 64,
4402
+ "bits": 8,
4403
+ "mode": "affine"
4404
+ },
4405
  "model.layers.46.linear_attn.in_proj_qkvz": {
4406
  "group_size": 64,
4407
  "bits": 8,
 
4432
  "bits": 4,
4433
  "mode": "mxfp4"
4434
  },
4435
+ "model.layers.46.mlp.shared_expert.gate_proj": {
4436
+ "group_size": 64,
4437
+ "bits": 8,
4438
+ "mode": "affine"
4439
+ },
4440
+ "model.layers.46.mlp.shared_expert.down_proj": {
4441
+ "group_size": 64,
4442
+ "bits": 8,
4443
+ "mode": "affine"
4444
+ },
4445
+ "model.layers.46.mlp.shared_expert.up_proj": {
4446
+ "group_size": 64,
4447
+ "bits": 8,
4448
+ "mode": "affine"
4449
+ },
4450
  "model.layers.47.self_attn.q_proj": {
4451
  "group_size": 64,
4452
  "bits": 8,
 
4482
  "bits": 4,
4483
  "mode": "mxfp4"
4484
  },
4485
+ "model.layers.47.mlp.shared_expert.gate_proj": {
4486
+ "group_size": 64,
4487
+ "bits": 8,
4488
+ "mode": "affine"
4489
+ },
4490
+ "model.layers.47.mlp.shared_expert.down_proj": {
4491
+ "group_size": 64,
4492
+ "bits": 8,
4493
+ "mode": "affine"
4494
+ },
4495
+ "model.layers.47.mlp.shared_expert.up_proj": {
4496
+ "group_size": 64,
4497
+ "bits": 8,
4498
+ "mode": "affine"
4499
+ },
4500
  "lm_head": {
4501
  "group_size": 64,
4502
  "bits": 8,
model-00001-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a19555cb848e8db762bcd0423f35fd9b6fa3e2ef0f9daea337f8a378051b949
3
- size 5136685826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2024fb443cbc42991f1b0e599cc023050b194d5f09b3d87586464b48e8c34320
3
+ size 5131653128
model-00002-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05d512b2820924fe14ae37f6c469d91147e8acbc8cf087ac5d3c7fcc476d9db2
3
- size 5354539238
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1b9256e4fd01e45ee40801340b8b7150047daf24822b943c81ee595e10d842
3
+ size 5367924965
model-00003-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c63f680db9da22a4651ec48e31ee47385d08fafdb87460ee0a152e5210314b9f
3
- size 5109835354
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df106317f9a4285b347e2ed95d5a2f1615cd19a137764f70841c72b9b24eb5ce
3
+ size 5358036471
model-00004-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08458af2bee021aec677bd00f0ffcabd6c24cd111f7233d16b605c1f6176a5bb
3
- size 5367122254
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f3d496f8e61aaf132e9bb7c03fb710e1089e452f035c833de9961d0dd0e137d
3
+ size 5367925082
model-00005-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:397a531b233251105ce94cfef0f24712d16b3615791cb1da0e2d732760b4bb27
3
- size 5368422424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c7fc16070ef760ae03df095948a8e069e3f4bfc1d919658595f6b85ee3304c
3
+ size 5106378225
model-00006-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f60cf313663a1ab19ba9f384bee4dceee9c82a90ea9693cab5ccdb3e615087d
3
- size 5352187259
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d673eac7cfed394f114f9831c337de195e669a52b18c44470e2a2e2ea8d6d8f5
3
+ size 5367924994
model-00007-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f75b6cf275b49b7e26b762634e190109c6829320f915fbbfd30083564d3e3e6c
3
- size 5352767299
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c28d7a730fbd540f0afa1f5a782d54b37c9ff34765601ade354a421635455be
3
+ size 5363794885
model-00008-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbdee11a1b04dd2669539ef1b7d73536923733a7a3a6b8900270a056c8477f5a
3
- size 5109835416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d47a37e53e70835b633d1e044262dcd100aae75d4880d14ca9e7cea2385e9eac
3
+ size 5367923842
model-00009-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65acab9e2eea14867519ee414621656e5e17bc8a5fb63a34059fad88fa7159da
3
- size 1508185434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:255c562d60a4d76022fabf121f9a4e4f2e5d8658cc443610b0173261e86d1d2c
3
+ size 1200623808
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 43659382272,
4
  "total_parameters": 79674388992
5
  },
6
  "weight_map": {
@@ -34,8 +34,6 @@
34
  "model.layers.0.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
35
  "model.layers.0.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
36
  "model.layers.0.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
37
- "model.layers.0.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
38
- "model.layers.0.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
39
  "model.layers.0.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
40
  "model.layers.0.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
41
  "model.layers.0.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
@@ -68,8 +66,6 @@
68
  "model.layers.1.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
69
  "model.layers.1.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
70
  "model.layers.1.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
71
- "model.layers.1.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
72
- "model.layers.1.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
73
  "model.layers.1.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
74
  "model.layers.1.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
75
  "model.layers.1.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
@@ -102,8 +98,6 @@
102
  "model.layers.10.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
103
  "model.layers.10.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
104
  "model.layers.10.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
105
- "model.layers.10.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
106
- "model.layers.10.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
107
  "model.layers.10.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
108
  "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
109
  "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
@@ -123,12 +117,10 @@
123
  "model.layers.11.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
124
  "model.layers.11.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
125
  "model.layers.11.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
126
- "model.layers.11.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
127
- "model.layers.11.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
128
  "model.layers.11.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
129
  "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
130
  "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
131
- "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00009.safetensors",
132
  "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00009.safetensors",
133
  "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00003-of-00009.safetensors",
134
  "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
@@ -171,8 +163,6 @@
171
  "model.layers.12.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
172
  "model.layers.12.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
173
  "model.layers.12.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
174
- "model.layers.12.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
175
- "model.layers.12.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
176
  "model.layers.12.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
177
  "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
178
  "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
@@ -205,8 +195,6 @@
205
  "model.layers.13.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
206
  "model.layers.13.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
207
  "model.layers.13.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
208
- "model.layers.13.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
209
- "model.layers.13.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
210
  "model.layers.13.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
211
  "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
212
  "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
@@ -239,8 +227,6 @@
239
  "model.layers.14.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
240
  "model.layers.14.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
241
  "model.layers.14.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
242
- "model.layers.14.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
243
- "model.layers.14.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
244
  "model.layers.14.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
245
  "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
246
  "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
@@ -260,8 +246,6 @@
260
  "model.layers.15.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
261
  "model.layers.15.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
262
  "model.layers.15.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
263
- "model.layers.15.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
264
- "model.layers.15.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
265
  "model.layers.15.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
266
  "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
267
  "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
@@ -308,8 +292,6 @@
308
  "model.layers.16.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
309
  "model.layers.16.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
310
  "model.layers.16.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
311
- "model.layers.16.mlp.shared_expert_gate.biases": "model-00003-of-00009.safetensors",
312
- "model.layers.16.mlp.shared_expert_gate.scales": "model-00003-of-00009.safetensors",
313
  "model.layers.16.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
314
  "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
315
  "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
@@ -342,13 +324,11 @@
342
  "model.layers.17.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
343
  "model.layers.17.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
344
  "model.layers.17.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
345
- "model.layers.17.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
346
- "model.layers.17.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
347
  "model.layers.17.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
348
  "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
349
  "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
350
  "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00009.safetensors",
351
- "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00009.safetensors",
352
  "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00004-of-00009.safetensors",
353
  "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
354
  "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
@@ -376,8 +356,6 @@
376
  "model.layers.18.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
377
  "model.layers.18.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
378
  "model.layers.18.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
379
- "model.layers.18.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
380
- "model.layers.18.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
381
  "model.layers.18.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
382
  "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
383
  "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
@@ -397,8 +375,6 @@
397
  "model.layers.19.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
398
  "model.layers.19.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
399
  "model.layers.19.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
400
- "model.layers.19.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
401
- "model.layers.19.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
402
  "model.layers.19.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
403
  "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
404
  "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
@@ -445,8 +421,6 @@
445
  "model.layers.2.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
446
  "model.layers.2.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
447
  "model.layers.2.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
448
- "model.layers.2.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
449
- "model.layers.2.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
450
  "model.layers.2.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
451
  "model.layers.2.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
452
  "model.layers.2.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
@@ -479,8 +453,6 @@
479
  "model.layers.20.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
480
  "model.layers.20.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
481
  "model.layers.20.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
482
- "model.layers.20.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
483
- "model.layers.20.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
484
  "model.layers.20.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
485
  "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
486
  "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
@@ -513,8 +485,6 @@
513
  "model.layers.21.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
514
  "model.layers.21.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
515
  "model.layers.21.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
516
- "model.layers.21.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
517
- "model.layers.21.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
518
  "model.layers.21.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
519
  "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
520
  "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
@@ -547,8 +517,6 @@
547
  "model.layers.22.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
548
  "model.layers.22.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
549
  "model.layers.22.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
550
- "model.layers.22.mlp.shared_expert_gate.biases": "model-00004-of-00009.safetensors",
551
- "model.layers.22.mlp.shared_expert_gate.scales": "model-00004-of-00009.safetensors",
552
  "model.layers.22.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
553
  "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
554
  "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
@@ -558,7 +526,7 @@
558
  "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
559
  "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
560
  "model.layers.23.input_layernorm.weight": "model-00004-of-00009.safetensors",
561
- "model.layers.23.mlp.gate.weight": "model-00005-of-00009.safetensors",
562
  "model.layers.23.mlp.shared_expert.down_proj.biases": "model-00005-of-00009.safetensors",
563
  "model.layers.23.mlp.shared_expert.down_proj.scales": "model-00005-of-00009.safetensors",
564
  "model.layers.23.mlp.shared_expert.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -568,13 +536,11 @@
568
  "model.layers.23.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
569
  "model.layers.23.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
570
  "model.layers.23.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
571
- "model.layers.23.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
572
- "model.layers.23.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
573
  "model.layers.23.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
574
  "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
575
  "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
576
  "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00009.safetensors",
577
- "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00009.safetensors",
578
  "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
579
  "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
580
  "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
@@ -616,8 +582,6 @@
616
  "model.layers.24.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
617
  "model.layers.24.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
618
  "model.layers.24.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
619
- "model.layers.24.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
620
- "model.layers.24.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
621
  "model.layers.24.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
622
  "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
623
  "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -650,8 +614,6 @@
650
  "model.layers.25.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
651
  "model.layers.25.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
652
  "model.layers.25.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
653
- "model.layers.25.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
654
- "model.layers.25.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
655
  "model.layers.25.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
656
  "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
657
  "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -684,8 +646,6 @@
684
  "model.layers.26.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
685
  "model.layers.26.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
686
  "model.layers.26.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
687
- "model.layers.26.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
688
- "model.layers.26.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
689
  "model.layers.26.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
690
  "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
691
  "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -705,8 +665,6 @@
705
  "model.layers.27.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
706
  "model.layers.27.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
707
  "model.layers.27.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
708
- "model.layers.27.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
709
- "model.layers.27.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
710
  "model.layers.27.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
711
  "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
712
  "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -753,8 +711,6 @@
753
  "model.layers.28.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
754
  "model.layers.28.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
755
  "model.layers.28.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
756
- "model.layers.28.mlp.shared_expert_gate.biases": "model-00005-of-00009.safetensors",
757
- "model.layers.28.mlp.shared_expert_gate.scales": "model-00005-of-00009.safetensors",
758
  "model.layers.28.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
759
  "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
760
  "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
@@ -763,21 +719,21 @@
763
  "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
764
  "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
765
  "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
766
- "model.layers.29.input_layernorm.weight": "model-00006-of-00009.safetensors",
767
- "model.layers.29.linear_attn.A_log": "model-00006-of-00009.safetensors",
768
  "model.layers.29.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
769
- "model.layers.29.linear_attn.dt_bias": "model-00006-of-00009.safetensors",
770
- "model.layers.29.linear_attn.in_proj_ba.biases": "model-00006-of-00009.safetensors",
771
- "model.layers.29.linear_attn.in_proj_ba.scales": "model-00006-of-00009.safetensors",
772
- "model.layers.29.linear_attn.in_proj_ba.weight": "model-00006-of-00009.safetensors",
773
- "model.layers.29.linear_attn.in_proj_qkvz.biases": "model-00006-of-00009.safetensors",
774
  "model.layers.29.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
775
  "model.layers.29.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
776
- "model.layers.29.linear_attn.norm.weight": "model-00006-of-00009.safetensors",
777
- "model.layers.29.linear_attn.out_proj.biases": "model-00006-of-00009.safetensors",
778
- "model.layers.29.linear_attn.out_proj.scales": "model-00006-of-00009.safetensors",
779
- "model.layers.29.linear_attn.out_proj.weight": "model-00006-of-00009.safetensors",
780
- "model.layers.29.mlp.gate.weight": "model-00006-of-00009.safetensors",
781
  "model.layers.29.mlp.shared_expert.down_proj.biases": "model-00006-of-00009.safetensors",
782
  "model.layers.29.mlp.shared_expert.down_proj.scales": "model-00006-of-00009.safetensors",
783
  "model.layers.29.mlp.shared_expert.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -787,8 +743,6 @@
787
  "model.layers.29.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
788
  "model.layers.29.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
789
  "model.layers.29.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
790
- "model.layers.29.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
791
- "model.layers.29.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
792
  "model.layers.29.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
793
  "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
794
  "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -796,7 +750,7 @@
796
  "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
797
  "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
798
  "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
799
- "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
800
  "model.layers.3.input_layernorm.weight": "model-00001-of-00009.safetensors",
801
  "model.layers.3.mlp.gate.weight": "model-00001-of-00009.safetensors",
802
  "model.layers.3.mlp.shared_expert.down_proj.biases": "model-00001-of-00009.safetensors",
@@ -808,8 +762,6 @@
808
  "model.layers.3.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
809
  "model.layers.3.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
810
  "model.layers.3.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
811
- "model.layers.3.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
812
- "model.layers.3.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
813
  "model.layers.3.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
814
  "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
815
  "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
@@ -856,8 +808,6 @@
856
  "model.layers.30.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
857
  "model.layers.30.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
858
  "model.layers.30.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
859
- "model.layers.30.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
860
- "model.layers.30.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
861
  "model.layers.30.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
862
  "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
863
  "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -877,8 +827,6 @@
877
  "model.layers.31.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
878
  "model.layers.31.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
879
  "model.layers.31.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
880
- "model.layers.31.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
881
- "model.layers.31.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
882
  "model.layers.31.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
883
  "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
884
  "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -925,8 +873,6 @@
925
  "model.layers.32.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
926
  "model.layers.32.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
927
  "model.layers.32.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
928
- "model.layers.32.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
929
- "model.layers.32.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
930
  "model.layers.32.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
931
  "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
932
  "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -959,8 +905,6 @@
959
  "model.layers.33.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
960
  "model.layers.33.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
961
  "model.layers.33.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
962
- "model.layers.33.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
963
- "model.layers.33.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
964
  "model.layers.33.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
965
  "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
966
  "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -993,8 +937,6 @@
993
  "model.layers.34.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
994
  "model.layers.34.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
995
  "model.layers.34.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
996
- "model.layers.34.mlp.shared_expert_gate.biases": "model-00006-of-00009.safetensors",
997
- "model.layers.34.mlp.shared_expert_gate.scales": "model-00006-of-00009.safetensors",
998
  "model.layers.34.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
999
  "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
1000
  "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
@@ -1003,8 +945,8 @@
1003
  "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
1004
  "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
1005
  "model.layers.34.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
1006
- "model.layers.35.input_layernorm.weight": "model-00007-of-00009.safetensors",
1007
- "model.layers.35.mlp.gate.weight": "model-00007-of-00009.safetensors",
1008
  "model.layers.35.mlp.shared_expert.down_proj.biases": "model-00007-of-00009.safetensors",
1009
  "model.layers.35.mlp.shared_expert.down_proj.scales": "model-00007-of-00009.safetensors",
1010
  "model.layers.35.mlp.shared_expert.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1014,8 +956,6 @@
1014
  "model.layers.35.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1015
  "model.layers.35.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1016
  "model.layers.35.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1017
- "model.layers.35.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1018
- "model.layers.35.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1019
  "model.layers.35.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1020
  "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1021
  "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1023,21 +963,21 @@
1023
  "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
1024
  "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00007-of-00009.safetensors",
1025
  "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1026
- "model.layers.35.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1027
- "model.layers.35.self_attn.k_norm.weight": "model-00007-of-00009.safetensors",
1028
- "model.layers.35.self_attn.k_proj.biases": "model-00007-of-00009.safetensors",
1029
- "model.layers.35.self_attn.k_proj.scales": "model-00007-of-00009.safetensors",
1030
- "model.layers.35.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
1031
- "model.layers.35.self_attn.o_proj.biases": "model-00007-of-00009.safetensors",
1032
- "model.layers.35.self_attn.o_proj.scales": "model-00007-of-00009.safetensors",
1033
- "model.layers.35.self_attn.o_proj.weight": "model-00007-of-00009.safetensors",
1034
- "model.layers.35.self_attn.q_norm.weight": "model-00007-of-00009.safetensors",
1035
- "model.layers.35.self_attn.q_proj.biases": "model-00007-of-00009.safetensors",
1036
- "model.layers.35.self_attn.q_proj.scales": "model-00007-of-00009.safetensors",
1037
- "model.layers.35.self_attn.q_proj.weight": "model-00007-of-00009.safetensors",
1038
- "model.layers.35.self_attn.v_proj.biases": "model-00007-of-00009.safetensors",
1039
- "model.layers.35.self_attn.v_proj.scales": "model-00007-of-00009.safetensors",
1040
- "model.layers.35.self_attn.v_proj.weight": "model-00007-of-00009.safetensors",
1041
  "model.layers.36.input_layernorm.weight": "model-00007-of-00009.safetensors",
1042
  "model.layers.36.linear_attn.A_log": "model-00007-of-00009.safetensors",
1043
  "model.layers.36.linear_attn.conv1d.weight": "model-00007-of-00009.safetensors",
@@ -1062,8 +1002,6 @@
1062
  "model.layers.36.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1063
  "model.layers.36.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1064
  "model.layers.36.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1065
- "model.layers.36.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1066
- "model.layers.36.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1067
  "model.layers.36.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1068
  "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1069
  "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1096,8 +1034,6 @@
1096
  "model.layers.37.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1097
  "model.layers.37.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1098
  "model.layers.37.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1099
- "model.layers.37.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1100
- "model.layers.37.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1101
  "model.layers.37.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1102
  "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1103
  "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1130,8 +1066,6 @@
1130
  "model.layers.38.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1131
  "model.layers.38.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1132
  "model.layers.38.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1133
- "model.layers.38.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1134
- "model.layers.38.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1135
  "model.layers.38.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1136
  "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1137
  "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1151,8 +1085,6 @@
1151
  "model.layers.39.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1152
  "model.layers.39.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1153
  "model.layers.39.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1154
- "model.layers.39.mlp.shared_expert_gate.biases": "model-00007-of-00009.safetensors",
1155
- "model.layers.39.mlp.shared_expert_gate.scales": "model-00007-of-00009.safetensors",
1156
  "model.layers.39.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1157
  "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1158
  "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
@@ -1199,8 +1131,6 @@
1199
  "model.layers.4.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
1200
  "model.layers.4.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
1201
  "model.layers.4.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
1202
- "model.layers.4.mlp.shared_expert_gate.biases": "model-00001-of-00009.safetensors",
1203
- "model.layers.4.mlp.shared_expert_gate.scales": "model-00001-of-00009.safetensors",
1204
  "model.layers.4.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
1205
  "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
1206
  "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
@@ -1224,19 +1154,17 @@
1224
  "model.layers.40.linear_attn.out_proj.scales": "model-00007-of-00009.safetensors",
1225
  "model.layers.40.linear_attn.out_proj.weight": "model-00007-of-00009.safetensors",
1226
  "model.layers.40.mlp.gate.weight": "model-00007-of-00009.safetensors",
1227
- "model.layers.40.mlp.shared_expert.down_proj.biases": "model-00008-of-00009.safetensors",
1228
- "model.layers.40.mlp.shared_expert.down_proj.scales": "model-00008-of-00009.safetensors",
1229
- "model.layers.40.mlp.shared_expert.down_proj.weight": "model-00008-of-00009.safetensors",
1230
- "model.layers.40.mlp.shared_expert.gate_proj.biases": "model-00008-of-00009.safetensors",
1231
- "model.layers.40.mlp.shared_expert.gate_proj.scales": "model-00008-of-00009.safetensors",
1232
- "model.layers.40.mlp.shared_expert.gate_proj.weight": "model-00008-of-00009.safetensors",
1233
- "model.layers.40.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1234
- "model.layers.40.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1235
- "model.layers.40.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1236
- "model.layers.40.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1237
- "model.layers.40.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1238
- "model.layers.40.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1239
- "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1240
  "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
1241
  "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
1242
  "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
@@ -1244,16 +1172,16 @@
1244
  "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1245
  "model.layers.40.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1246
  "model.layers.41.input_layernorm.weight": "model-00008-of-00009.safetensors",
1247
- "model.layers.41.linear_attn.A_log": "model-00008-of-00009.safetensors",
1248
- "model.layers.41.linear_attn.conv1d.weight": "model-00008-of-00009.safetensors",
1249
- "model.layers.41.linear_attn.dt_bias": "model-00008-of-00009.safetensors",
1250
- "model.layers.41.linear_attn.in_proj_ba.biases": "model-00008-of-00009.safetensors",
1251
- "model.layers.41.linear_attn.in_proj_ba.scales": "model-00008-of-00009.safetensors",
1252
- "model.layers.41.linear_attn.in_proj_ba.weight": "model-00008-of-00009.safetensors",
1253
- "model.layers.41.linear_attn.in_proj_qkvz.biases": "model-00008-of-00009.safetensors",
1254
- "model.layers.41.linear_attn.in_proj_qkvz.scales": "model-00008-of-00009.safetensors",
1255
- "model.layers.41.linear_attn.in_proj_qkvz.weight": "model-00008-of-00009.safetensors",
1256
- "model.layers.41.linear_attn.norm.weight": "model-00008-of-00009.safetensors",
1257
  "model.layers.41.linear_attn.out_proj.biases": "model-00008-of-00009.safetensors",
1258
  "model.layers.41.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1259
  "model.layers.41.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
@@ -1267,8 +1195,6 @@
1267
  "model.layers.41.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1268
  "model.layers.41.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1269
  "model.layers.41.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1270
- "model.layers.41.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1271
- "model.layers.41.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1272
  "model.layers.41.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1273
  "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1274
  "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
@@ -1301,8 +1227,6 @@
1301
  "model.layers.42.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1302
  "model.layers.42.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1303
  "model.layers.42.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1304
- "model.layers.42.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1305
- "model.layers.42.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1306
  "model.layers.42.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1307
  "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1308
  "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
@@ -1322,8 +1246,6 @@
1322
  "model.layers.43.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1323
  "model.layers.43.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1324
  "model.layers.43.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1325
- "model.layers.43.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1326
- "model.layers.43.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1327
  "model.layers.43.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1328
  "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1329
  "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
@@ -1370,8 +1292,6 @@
1370
  "model.layers.44.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1371
  "model.layers.44.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1372
  "model.layers.44.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1373
- "model.layers.44.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1374
- "model.layers.44.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1375
  "model.layers.44.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1376
  "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1377
  "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
@@ -1404,8 +1324,6 @@
1404
  "model.layers.45.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1405
  "model.layers.45.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1406
  "model.layers.45.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1407
- "model.layers.45.mlp.shared_expert_gate.biases": "model-00008-of-00009.safetensors",
1408
- "model.layers.45.mlp.shared_expert_gate.scales": "model-00008-of-00009.safetensors",
1409
  "model.layers.45.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1410
  "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1411
  "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
@@ -1429,20 +1347,18 @@
1429
  "model.layers.46.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1430
  "model.layers.46.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
1431
  "model.layers.46.mlp.gate.weight": "model-00008-of-00009.safetensors",
1432
- "model.layers.46.mlp.shared_expert.down_proj.biases": "model-00009-of-00009.safetensors",
1433
- "model.layers.46.mlp.shared_expert.down_proj.scales": "model-00009-of-00009.safetensors",
1434
- "model.layers.46.mlp.shared_expert.down_proj.weight": "model-00009-of-00009.safetensors",
1435
- "model.layers.46.mlp.shared_expert.gate_proj.biases": "model-00009-of-00009.safetensors",
1436
- "model.layers.46.mlp.shared_expert.gate_proj.scales": "model-00009-of-00009.safetensors",
1437
- "model.layers.46.mlp.shared_expert.gate_proj.weight": "model-00009-of-00009.safetensors",
1438
- "model.layers.46.mlp.shared_expert.up_proj.biases": "model-00009-of-00009.safetensors",
1439
- "model.layers.46.mlp.shared_expert.up_proj.scales": "model-00009-of-00009.safetensors",
1440
- "model.layers.46.mlp.shared_expert.up_proj.weight": "model-00009-of-00009.safetensors",
1441
- "model.layers.46.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1442
- "model.layers.46.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1443
- "model.layers.46.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1444
- "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1445
- "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
1446
  "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
1447
  "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00009.safetensors",
1448
  "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
@@ -1459,8 +1375,6 @@
1459
  "model.layers.47.mlp.shared_expert.up_proj.biases": "model-00009-of-00009.safetensors",
1460
  "model.layers.47.mlp.shared_expert.up_proj.scales": "model-00009-of-00009.safetensors",
1461
  "model.layers.47.mlp.shared_expert.up_proj.weight": "model-00009-of-00009.safetensors",
1462
- "model.layers.47.mlp.shared_expert_gate.biases": "model-00009-of-00009.safetensors",
1463
- "model.layers.47.mlp.shared_expert_gate.scales": "model-00009-of-00009.safetensors",
1464
  "model.layers.47.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1465
  "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1466
  "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
@@ -1470,19 +1384,19 @@
1470
  "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00009-of-00009.safetensors",
1471
  "model.layers.47.post_attention_layernorm.weight": "model-00009-of-00009.safetensors",
1472
  "model.layers.47.self_attn.k_norm.weight": "model-00009-of-00009.safetensors",
1473
- "model.layers.47.self_attn.k_proj.biases": "model-00009-of-00009.safetensors",
1474
- "model.layers.47.self_attn.k_proj.scales": "model-00009-of-00009.safetensors",
1475
- "model.layers.47.self_attn.k_proj.weight": "model-00009-of-00009.safetensors",
1476
  "model.layers.47.self_attn.o_proj.biases": "model-00009-of-00009.safetensors",
1477
  "model.layers.47.self_attn.o_proj.scales": "model-00009-of-00009.safetensors",
1478
  "model.layers.47.self_attn.o_proj.weight": "model-00009-of-00009.safetensors",
1479
  "model.layers.47.self_attn.q_norm.weight": "model-00009-of-00009.safetensors",
1480
- "model.layers.47.self_attn.q_proj.biases": "model-00009-of-00009.safetensors",
1481
- "model.layers.47.self_attn.q_proj.scales": "model-00009-of-00009.safetensors",
1482
- "model.layers.47.self_attn.q_proj.weight": "model-00009-of-00009.safetensors",
1483
- "model.layers.47.self_attn.v_proj.biases": "model-00009-of-00009.safetensors",
1484
- "model.layers.47.self_attn.v_proj.scales": "model-00009-of-00009.safetensors",
1485
- "model.layers.47.self_attn.v_proj.weight": "model-00009-of-00009.safetensors",
1486
  "model.layers.5.input_layernorm.weight": "model-00001-of-00009.safetensors",
1487
  "model.layers.5.linear_attn.A_log": "model-00001-of-00009.safetensors",
1488
  "model.layers.5.linear_attn.conv1d.weight": "model-00001-of-00009.safetensors",
@@ -1507,8 +1421,6 @@
1507
  "model.layers.5.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1508
  "model.layers.5.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1509
  "model.layers.5.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
1510
- "model.layers.5.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1511
- "model.layers.5.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1512
  "model.layers.5.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1513
  "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1514
  "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
@@ -1541,8 +1453,6 @@
1541
  "model.layers.6.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1542
  "model.layers.6.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1543
  "model.layers.6.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
1544
- "model.layers.6.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1545
- "model.layers.6.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1546
  "model.layers.6.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1547
  "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1548
  "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
@@ -1562,8 +1472,6 @@
1562
  "model.layers.7.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1563
  "model.layers.7.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1564
  "model.layers.7.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
1565
- "model.layers.7.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1566
- "model.layers.7.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1567
  "model.layers.7.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1568
  "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1569
  "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
@@ -1610,8 +1518,6 @@
1610
  "model.layers.8.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1611
  "model.layers.8.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1612
  "model.layers.8.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
1613
- "model.layers.8.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1614
- "model.layers.8.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1615
  "model.layers.8.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1616
  "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1617
  "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
@@ -1644,8 +1550,6 @@
1644
  "model.layers.9.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1645
  "model.layers.9.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1646
  "model.layers.9.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
1647
- "model.layers.9.mlp.shared_expert_gate.biases": "model-00002-of-00009.safetensors",
1648
- "model.layers.9.mlp.shared_expert_gate.scales": "model-00002-of-00009.safetensors",
1649
  "model.layers.9.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1650
  "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1651
  "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 43631998464,
4
  "total_parameters": 79674388992
5
  },
6
  "weight_map": {
 
34
  "model.layers.0.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
35
  "model.layers.0.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
36
  "model.layers.0.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
 
 
37
  "model.layers.0.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
38
  "model.layers.0.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
39
  "model.layers.0.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
 
66
  "model.layers.1.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
67
  "model.layers.1.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
68
  "model.layers.1.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
 
 
69
  "model.layers.1.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
70
  "model.layers.1.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
71
  "model.layers.1.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
 
98
  "model.layers.10.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
99
  "model.layers.10.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
100
  "model.layers.10.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
101
  "model.layers.10.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
102
  "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
103
  "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
117
  "model.layers.11.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
118
  "model.layers.11.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
119
  "model.layers.11.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
120
  "model.layers.11.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
121
  "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
122
  "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
123
+ "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00009.safetensors",
124
  "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00009.safetensors",
125
  "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00003-of-00009.safetensors",
126
  "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00003-of-00009.safetensors",
 
163
  "model.layers.12.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
164
  "model.layers.12.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
165
  "model.layers.12.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
166
  "model.layers.12.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
167
  "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
168
  "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
 
195
  "model.layers.13.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
196
  "model.layers.13.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
197
  "model.layers.13.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
198
  "model.layers.13.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
199
  "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
200
  "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
 
227
  "model.layers.14.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
228
  "model.layers.14.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
229
  "model.layers.14.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
230
  "model.layers.14.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
231
  "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
232
  "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
 
246
  "model.layers.15.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
247
  "model.layers.15.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
248
  "model.layers.15.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
249
  "model.layers.15.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
250
  "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
251
  "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
 
292
  "model.layers.16.mlp.shared_expert.up_proj.biases": "model-00003-of-00009.safetensors",
293
  "model.layers.16.mlp.shared_expert.up_proj.scales": "model-00003-of-00009.safetensors",
294
  "model.layers.16.mlp.shared_expert.up_proj.weight": "model-00003-of-00009.safetensors",
 
 
295
  "model.layers.16.mlp.shared_expert_gate.weight": "model-00003-of-00009.safetensors",
296
  "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00003-of-00009.safetensors",
297
  "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00003-of-00009.safetensors",
 
324
  "model.layers.17.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
325
  "model.layers.17.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
326
  "model.layers.17.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
327
  "model.layers.17.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
328
  "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
329
  "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
330
  "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00009.safetensors",
331
+ "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00009.safetensors",
332
  "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00004-of-00009.safetensors",
333
  "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
334
  "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
 
356
  "model.layers.18.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
357
  "model.layers.18.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
358
  "model.layers.18.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
359
  "model.layers.18.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
360
  "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
361
  "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
 
375
  "model.layers.19.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
376
  "model.layers.19.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
377
  "model.layers.19.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
378
  "model.layers.19.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
379
  "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
380
  "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
 
421
  "model.layers.2.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
422
  "model.layers.2.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
423
  "model.layers.2.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
 
 
424
  "model.layers.2.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
425
  "model.layers.2.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
426
  "model.layers.2.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
 
453
  "model.layers.20.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
454
  "model.layers.20.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
455
  "model.layers.20.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
456
  "model.layers.20.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
457
  "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
458
  "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
 
485
  "model.layers.21.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
486
  "model.layers.21.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
487
  "model.layers.21.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
488
  "model.layers.21.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
489
  "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
490
  "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
 
517
  "model.layers.22.mlp.shared_expert.up_proj.biases": "model-00004-of-00009.safetensors",
518
  "model.layers.22.mlp.shared_expert.up_proj.scales": "model-00004-of-00009.safetensors",
519
  "model.layers.22.mlp.shared_expert.up_proj.weight": "model-00004-of-00009.safetensors",
 
 
520
  "model.layers.22.mlp.shared_expert_gate.weight": "model-00004-of-00009.safetensors",
521
  "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00004-of-00009.safetensors",
522
  "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00004-of-00009.safetensors",
 
526
  "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00004-of-00009.safetensors",
527
  "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
528
  "model.layers.23.input_layernorm.weight": "model-00004-of-00009.safetensors",
529
+ "model.layers.23.mlp.gate.weight": "model-00004-of-00009.safetensors",
530
  "model.layers.23.mlp.shared_expert.down_proj.biases": "model-00005-of-00009.safetensors",
531
  "model.layers.23.mlp.shared_expert.down_proj.scales": "model-00005-of-00009.safetensors",
532
  "model.layers.23.mlp.shared_expert.down_proj.weight": "model-00005-of-00009.safetensors",
 
536
  "model.layers.23.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
537
  "model.layers.23.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
538
  "model.layers.23.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
539
  "model.layers.23.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
540
  "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
541
  "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
542
  "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00009.safetensors",
543
+ "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00009.safetensors",
544
  "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
545
  "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
546
  "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
 
582
  "model.layers.24.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
583
  "model.layers.24.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
584
  "model.layers.24.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
585
  "model.layers.24.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
586
  "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
587
  "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
 
614
  "model.layers.25.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
615
  "model.layers.25.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
616
  "model.layers.25.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
617
  "model.layers.25.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
618
  "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
619
  "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
 
646
  "model.layers.26.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
647
  "model.layers.26.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
648
  "model.layers.26.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
649
  "model.layers.26.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
650
  "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
651
  "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
 
665
  "model.layers.27.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
666
  "model.layers.27.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
667
  "model.layers.27.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
668
  "model.layers.27.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
669
  "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
670
  "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
 
711
  "model.layers.28.mlp.shared_expert.up_proj.biases": "model-00005-of-00009.safetensors",
712
  "model.layers.28.mlp.shared_expert.up_proj.scales": "model-00005-of-00009.safetensors",
713
  "model.layers.28.mlp.shared_expert.up_proj.weight": "model-00005-of-00009.safetensors",
 
 
714
  "model.layers.28.mlp.shared_expert_gate.weight": "model-00005-of-00009.safetensors",
715
  "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00005-of-00009.safetensors",
716
  "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00005-of-00009.safetensors",
 
719
  "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00005-of-00009.safetensors",
720
  "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00005-of-00009.safetensors",
721
  "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
722
+ "model.layers.29.input_layernorm.weight": "model-00005-of-00009.safetensors",
723
+ "model.layers.29.linear_attn.A_log": "model-00005-of-00009.safetensors",
724
  "model.layers.29.linear_attn.conv1d.weight": "model-00005-of-00009.safetensors",
725
+ "model.layers.29.linear_attn.dt_bias": "model-00005-of-00009.safetensors",
726
+ "model.layers.29.linear_attn.in_proj_ba.biases": "model-00005-of-00009.safetensors",
727
+ "model.layers.29.linear_attn.in_proj_ba.scales": "model-00005-of-00009.safetensors",
728
+ "model.layers.29.linear_attn.in_proj_ba.weight": "model-00005-of-00009.safetensors",
729
+ "model.layers.29.linear_attn.in_proj_qkvz.biases": "model-00005-of-00009.safetensors",
730
  "model.layers.29.linear_attn.in_proj_qkvz.scales": "model-00005-of-00009.safetensors",
731
  "model.layers.29.linear_attn.in_proj_qkvz.weight": "model-00005-of-00009.safetensors",
732
+ "model.layers.29.linear_attn.norm.weight": "model-00005-of-00009.safetensors",
733
+ "model.layers.29.linear_attn.out_proj.biases": "model-00005-of-00009.safetensors",
734
+ "model.layers.29.linear_attn.out_proj.scales": "model-00005-of-00009.safetensors",
735
+ "model.layers.29.linear_attn.out_proj.weight": "model-00005-of-00009.safetensors",
736
+ "model.layers.29.mlp.gate.weight": "model-00005-of-00009.safetensors",
737
  "model.layers.29.mlp.shared_expert.down_proj.biases": "model-00006-of-00009.safetensors",
738
  "model.layers.29.mlp.shared_expert.down_proj.scales": "model-00006-of-00009.safetensors",
739
  "model.layers.29.mlp.shared_expert.down_proj.weight": "model-00006-of-00009.safetensors",
 
743
  "model.layers.29.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
744
  "model.layers.29.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
745
  "model.layers.29.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
746
  "model.layers.29.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
747
  "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
748
  "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
750
  "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00009.safetensors",
751
  "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
752
  "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
753
+ "model.layers.29.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
754
  "model.layers.3.input_layernorm.weight": "model-00001-of-00009.safetensors",
755
  "model.layers.3.mlp.gate.weight": "model-00001-of-00009.safetensors",
756
  "model.layers.3.mlp.shared_expert.down_proj.biases": "model-00001-of-00009.safetensors",
 
762
  "model.layers.3.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
763
  "model.layers.3.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
764
  "model.layers.3.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
 
 
765
  "model.layers.3.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
766
  "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
767
  "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
 
808
  "model.layers.30.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
809
  "model.layers.30.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
810
  "model.layers.30.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
811
  "model.layers.30.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
812
  "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
813
  "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
827
  "model.layers.31.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
828
  "model.layers.31.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
829
  "model.layers.31.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
830
  "model.layers.31.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
831
  "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
832
  "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
873
  "model.layers.32.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
874
  "model.layers.32.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
875
  "model.layers.32.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
876
  "model.layers.32.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
877
  "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
878
  "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
905
  "model.layers.33.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
906
  "model.layers.33.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
907
  "model.layers.33.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
908
  "model.layers.33.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
909
  "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
910
  "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
937
  "model.layers.34.mlp.shared_expert.up_proj.biases": "model-00006-of-00009.safetensors",
938
  "model.layers.34.mlp.shared_expert.up_proj.scales": "model-00006-of-00009.safetensors",
939
  "model.layers.34.mlp.shared_expert.up_proj.weight": "model-00006-of-00009.safetensors",
 
 
940
  "model.layers.34.mlp.shared_expert_gate.weight": "model-00006-of-00009.safetensors",
941
  "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00006-of-00009.safetensors",
942
  "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00006-of-00009.safetensors",
 
945
  "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00006-of-00009.safetensors",
946
  "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00006-of-00009.safetensors",
947
  "model.layers.34.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
948
+ "model.layers.35.input_layernorm.weight": "model-00006-of-00009.safetensors",
949
+ "model.layers.35.mlp.gate.weight": "model-00006-of-00009.safetensors",
950
  "model.layers.35.mlp.shared_expert.down_proj.biases": "model-00007-of-00009.safetensors",
951
  "model.layers.35.mlp.shared_expert.down_proj.scales": "model-00007-of-00009.safetensors",
952
  "model.layers.35.mlp.shared_expert.down_proj.weight": "model-00007-of-00009.safetensors",
 
956
  "model.layers.35.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
957
  "model.layers.35.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
958
  "model.layers.35.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
 
 
959
  "model.layers.35.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
960
  "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
961
  "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
 
963
  "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
964
  "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00007-of-00009.safetensors",
965
  "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
966
+ "model.layers.35.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
967
+ "model.layers.35.self_attn.k_norm.weight": "model-00006-of-00009.safetensors",
968
+ "model.layers.35.self_attn.k_proj.biases": "model-00006-of-00009.safetensors",
969
+ "model.layers.35.self_attn.k_proj.scales": "model-00006-of-00009.safetensors",
970
+ "model.layers.35.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
971
+ "model.layers.35.self_attn.o_proj.biases": "model-00006-of-00009.safetensors",
972
+ "model.layers.35.self_attn.o_proj.scales": "model-00006-of-00009.safetensors",
973
+ "model.layers.35.self_attn.o_proj.weight": "model-00006-of-00009.safetensors",
974
+ "model.layers.35.self_attn.q_norm.weight": "model-00006-of-00009.safetensors",
975
+ "model.layers.35.self_attn.q_proj.biases": "model-00006-of-00009.safetensors",
976
+ "model.layers.35.self_attn.q_proj.scales": "model-00006-of-00009.safetensors",
977
+ "model.layers.35.self_attn.q_proj.weight": "model-00006-of-00009.safetensors",
978
+ "model.layers.35.self_attn.v_proj.biases": "model-00006-of-00009.safetensors",
979
+ "model.layers.35.self_attn.v_proj.scales": "model-00006-of-00009.safetensors",
980
+ "model.layers.35.self_attn.v_proj.weight": "model-00006-of-00009.safetensors",
981
  "model.layers.36.input_layernorm.weight": "model-00007-of-00009.safetensors",
982
  "model.layers.36.linear_attn.A_log": "model-00007-of-00009.safetensors",
983
  "model.layers.36.linear_attn.conv1d.weight": "model-00007-of-00009.safetensors",
 
1002
  "model.layers.36.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1003
  "model.layers.36.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1004
  "model.layers.36.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
 
 
1005
  "model.layers.36.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1006
  "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1007
  "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
 
1034
  "model.layers.37.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1035
  "model.layers.37.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1036
  "model.layers.37.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
 
 
1037
  "model.layers.37.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1038
  "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1039
  "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
 
1066
  "model.layers.38.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1067
  "model.layers.38.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1068
  "model.layers.38.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
 
 
1069
  "model.layers.38.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1070
  "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1071
  "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
 
1085
  "model.layers.39.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1086
  "model.layers.39.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1087
  "model.layers.39.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
 
 
1088
  "model.layers.39.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1089
  "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
1090
  "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
 
1131
  "model.layers.4.mlp.shared_expert.up_proj.biases": "model-00001-of-00009.safetensors",
1132
  "model.layers.4.mlp.shared_expert.up_proj.scales": "model-00001-of-00009.safetensors",
1133
  "model.layers.4.mlp.shared_expert.up_proj.weight": "model-00001-of-00009.safetensors",
 
 
1134
  "model.layers.4.mlp.shared_expert_gate.weight": "model-00001-of-00009.safetensors",
1135
  "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00001-of-00009.safetensors",
1136
  "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00001-of-00009.safetensors",
 
1154
  "model.layers.40.linear_attn.out_proj.scales": "model-00007-of-00009.safetensors",
1155
  "model.layers.40.linear_attn.out_proj.weight": "model-00007-of-00009.safetensors",
1156
  "model.layers.40.mlp.gate.weight": "model-00007-of-00009.safetensors",
1157
+ "model.layers.40.mlp.shared_expert.down_proj.biases": "model-00007-of-00009.safetensors",
1158
+ "model.layers.40.mlp.shared_expert.down_proj.scales": "model-00007-of-00009.safetensors",
1159
+ "model.layers.40.mlp.shared_expert.down_proj.weight": "model-00007-of-00009.safetensors",
1160
+ "model.layers.40.mlp.shared_expert.gate_proj.biases": "model-00007-of-00009.safetensors",
1161
+ "model.layers.40.mlp.shared_expert.gate_proj.scales": "model-00007-of-00009.safetensors",
1162
+ "model.layers.40.mlp.shared_expert.gate_proj.weight": "model-00007-of-00009.safetensors",
1163
+ "model.layers.40.mlp.shared_expert.up_proj.biases": "model-00007-of-00009.safetensors",
1164
+ "model.layers.40.mlp.shared_expert.up_proj.scales": "model-00007-of-00009.safetensors",
1165
+ "model.layers.40.mlp.shared_expert.up_proj.weight": "model-00007-of-00009.safetensors",
1166
+ "model.layers.40.mlp.shared_expert_gate.weight": "model-00007-of-00009.safetensors",
1167
+ "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00007-of-00009.safetensors",
 
 
1168
  "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00007-of-00009.safetensors",
1169
  "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00009.safetensors",
1170
  "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00009.safetensors",
 
1172
  "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00007-of-00009.safetensors",
1173
  "model.layers.40.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
1174
  "model.layers.41.input_layernorm.weight": "model-00008-of-00009.safetensors",
1175
+ "model.layers.41.linear_attn.A_log": "model-00007-of-00009.safetensors",
1176
+ "model.layers.41.linear_attn.conv1d.weight": "model-00007-of-00009.safetensors",
1177
+ "model.layers.41.linear_attn.dt_bias": "model-00007-of-00009.safetensors",
1178
+ "model.layers.41.linear_attn.in_proj_ba.biases": "model-00007-of-00009.safetensors",
1179
+ "model.layers.41.linear_attn.in_proj_ba.scales": "model-00007-of-00009.safetensors",
1180
+ "model.layers.41.linear_attn.in_proj_ba.weight": "model-00007-of-00009.safetensors",
1181
+ "model.layers.41.linear_attn.in_proj_qkvz.biases": "model-00007-of-00009.safetensors",
1182
+ "model.layers.41.linear_attn.in_proj_qkvz.scales": "model-00007-of-00009.safetensors",
1183
+ "model.layers.41.linear_attn.in_proj_qkvz.weight": "model-00007-of-00009.safetensors",
1184
+ "model.layers.41.linear_attn.norm.weight": "model-00007-of-00009.safetensors",
1185
  "model.layers.41.linear_attn.out_proj.biases": "model-00008-of-00009.safetensors",
1186
  "model.layers.41.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1187
  "model.layers.41.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
 
1195
  "model.layers.41.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1196
  "model.layers.41.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1197
  "model.layers.41.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
 
 
1198
  "model.layers.41.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1199
  "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1200
  "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1227
  "model.layers.42.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1228
  "model.layers.42.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1229
  "model.layers.42.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
 
 
1230
  "model.layers.42.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1231
  "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1232
  "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1246
  "model.layers.43.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1247
  "model.layers.43.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1248
  "model.layers.43.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
 
 
1249
  "model.layers.43.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1250
  "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1251
  "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1292
  "model.layers.44.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1293
  "model.layers.44.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1294
  "model.layers.44.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
 
 
1295
  "model.layers.44.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1296
  "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1297
  "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1324
  "model.layers.45.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1325
  "model.layers.45.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1326
  "model.layers.45.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
 
 
1327
  "model.layers.45.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1328
  "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1329
  "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
1347
  "model.layers.46.linear_attn.out_proj.scales": "model-00008-of-00009.safetensors",
1348
  "model.layers.46.linear_attn.out_proj.weight": "model-00008-of-00009.safetensors",
1349
  "model.layers.46.mlp.gate.weight": "model-00008-of-00009.safetensors",
1350
+ "model.layers.46.mlp.shared_expert.down_proj.biases": "model-00008-of-00009.safetensors",
1351
+ "model.layers.46.mlp.shared_expert.down_proj.scales": "model-00008-of-00009.safetensors",
1352
+ "model.layers.46.mlp.shared_expert.down_proj.weight": "model-00008-of-00009.safetensors",
1353
+ "model.layers.46.mlp.shared_expert.gate_proj.biases": "model-00008-of-00009.safetensors",
1354
+ "model.layers.46.mlp.shared_expert.gate_proj.scales": "model-00008-of-00009.safetensors",
1355
+ "model.layers.46.mlp.shared_expert.gate_proj.weight": "model-00008-of-00009.safetensors",
1356
+ "model.layers.46.mlp.shared_expert.up_proj.biases": "model-00008-of-00009.safetensors",
1357
+ "model.layers.46.mlp.shared_expert.up_proj.scales": "model-00008-of-00009.safetensors",
1358
+ "model.layers.46.mlp.shared_expert.up_proj.weight": "model-00008-of-00009.safetensors",
1359
+ "model.layers.46.mlp.shared_expert_gate.weight": "model-00008-of-00009.safetensors",
1360
+ "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00008-of-00009.safetensors",
1361
+ "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00008-of-00009.safetensors",
 
 
1362
  "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00009.safetensors",
1363
  "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00009.safetensors",
1364
  "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00008-of-00009.safetensors",
 
1375
  "model.layers.47.mlp.shared_expert.up_proj.biases": "model-00009-of-00009.safetensors",
1376
  "model.layers.47.mlp.shared_expert.up_proj.scales": "model-00009-of-00009.safetensors",
1377
  "model.layers.47.mlp.shared_expert.up_proj.weight": "model-00009-of-00009.safetensors",
 
 
1378
  "model.layers.47.mlp.shared_expert_gate.weight": "model-00009-of-00009.safetensors",
1379
  "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00009-of-00009.safetensors",
1380
  "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00009-of-00009.safetensors",
 
1384
  "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00009-of-00009.safetensors",
1385
  "model.layers.47.post_attention_layernorm.weight": "model-00009-of-00009.safetensors",
1386
  "model.layers.47.self_attn.k_norm.weight": "model-00009-of-00009.safetensors",
1387
+ "model.layers.47.self_attn.k_proj.biases": "model-00008-of-00009.safetensors",
1388
+ "model.layers.47.self_attn.k_proj.scales": "model-00008-of-00009.safetensors",
1389
+ "model.layers.47.self_attn.k_proj.weight": "model-00008-of-00009.safetensors",
1390
  "model.layers.47.self_attn.o_proj.biases": "model-00009-of-00009.safetensors",
1391
  "model.layers.47.self_attn.o_proj.scales": "model-00009-of-00009.safetensors",
1392
  "model.layers.47.self_attn.o_proj.weight": "model-00009-of-00009.safetensors",
1393
  "model.layers.47.self_attn.q_norm.weight": "model-00009-of-00009.safetensors",
1394
+ "model.layers.47.self_attn.q_proj.biases": "model-00008-of-00009.safetensors",
1395
+ "model.layers.47.self_attn.q_proj.scales": "model-00008-of-00009.safetensors",
1396
+ "model.layers.47.self_attn.q_proj.weight": "model-00008-of-00009.safetensors",
1397
+ "model.layers.47.self_attn.v_proj.biases": "model-00008-of-00009.safetensors",
1398
+ "model.layers.47.self_attn.v_proj.scales": "model-00008-of-00009.safetensors",
1399
+ "model.layers.47.self_attn.v_proj.weight": "model-00008-of-00009.safetensors",
1400
  "model.layers.5.input_layernorm.weight": "model-00001-of-00009.safetensors",
1401
  "model.layers.5.linear_attn.A_log": "model-00001-of-00009.safetensors",
1402
  "model.layers.5.linear_attn.conv1d.weight": "model-00001-of-00009.safetensors",
 
1421
  "model.layers.5.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1422
  "model.layers.5.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1423
  "model.layers.5.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
1424
  "model.layers.5.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1425
  "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1426
  "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
1453
  "model.layers.6.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1454
  "model.layers.6.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1455
  "model.layers.6.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
1456
  "model.layers.6.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1457
  "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1458
  "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
1472
  "model.layers.7.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1473
  "model.layers.7.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1474
  "model.layers.7.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
1475
  "model.layers.7.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1476
  "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1477
  "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
1518
  "model.layers.8.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1519
  "model.layers.8.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1520
  "model.layers.8.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
1521
  "model.layers.8.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1522
  "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1523
  "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",
 
1550
  "model.layers.9.mlp.shared_expert.up_proj.biases": "model-00002-of-00009.safetensors",
1551
  "model.layers.9.mlp.shared_expert.up_proj.scales": "model-00002-of-00009.safetensors",
1552
  "model.layers.9.mlp.shared_expert.up_proj.weight": "model-00002-of-00009.safetensors",
 
 
1553
  "model.layers.9.mlp.shared_expert_gate.weight": "model-00002-of-00009.safetensors",
1554
  "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00002-of-00009.safetensors",
1555
  "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00002-of-00009.safetensors",