cpatonn committed on
Commit
b867599
·
verified ·
1 Parent(s): 229d277

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +62 -4
config.json CHANGED
@@ -98,11 +98,9 @@
98
  "num_bits": 8,
99
  "observer": "mse",
100
  "observer_kwargs": {},
101
- "scale_dtype": null,
102
  "strategy": "group",
103
  "symmetric": true,
104
- "type": "int",
105
- "zp_dtype": null
106
  }
107
  }
108
  },
@@ -113,6 +111,7 @@
113
  "model.layers.0.linear_attn.in_proj_ba",
114
  "model.layers.0.linear_attn.out_proj",
115
  "model.layers.0.mlp.gate",
 
116
  "model.layers.0.mlp.shared_expert.gate_proj",
117
  "model.layers.0.mlp.shared_expert.up_proj",
118
  "model.layers.0.mlp.shared_expert.down_proj",
@@ -121,6 +120,7 @@
121
  "model.layers.1.linear_attn.in_proj_ba",
122
  "model.layers.1.linear_attn.out_proj",
123
  "model.layers.1.mlp.gate",
 
124
  "model.layers.1.mlp.shared_expert.gate_proj",
125
  "model.layers.1.mlp.shared_expert.up_proj",
126
  "model.layers.1.mlp.shared_expert.down_proj",
@@ -129,15 +129,18 @@
129
  "model.layers.2.linear_attn.in_proj_ba",
130
  "model.layers.2.linear_attn.out_proj",
131
  "model.layers.2.mlp.gate",
 
132
  "model.layers.2.mlp.shared_expert.gate_proj",
133
  "model.layers.2.mlp.shared_expert.up_proj",
134
  "model.layers.2.mlp.shared_expert.down_proj",
135
  "model.layers.2.mlp.shared_expert_gate",
 
136
  "model.layers.3.self_attn.q_proj",
137
  "model.layers.3.self_attn.k_proj",
138
  "model.layers.3.self_attn.v_proj",
139
  "model.layers.3.self_attn.o_proj",
140
  "model.layers.3.mlp.gate",
 
141
  "model.layers.3.mlp.shared_expert.gate_proj",
142
  "model.layers.3.mlp.shared_expert.up_proj",
143
  "model.layers.3.mlp.shared_expert.down_proj",
@@ -146,6 +149,7 @@
146
  "model.layers.4.linear_attn.in_proj_ba",
147
  "model.layers.4.linear_attn.out_proj",
148
  "model.layers.4.mlp.gate",
 
149
  "model.layers.4.mlp.shared_expert.gate_proj",
150
  "model.layers.4.mlp.shared_expert.up_proj",
151
  "model.layers.4.mlp.shared_expert.down_proj",
@@ -154,6 +158,7 @@
154
  "model.layers.5.linear_attn.in_proj_ba",
155
  "model.layers.5.linear_attn.out_proj",
156
  "model.layers.5.mlp.gate",
 
157
  "model.layers.5.mlp.shared_expert.gate_proj",
158
  "model.layers.5.mlp.shared_expert.up_proj",
159
  "model.layers.5.mlp.shared_expert.down_proj",
@@ -162,15 +167,18 @@
162
  "model.layers.6.linear_attn.in_proj_ba",
163
  "model.layers.6.linear_attn.out_proj",
164
  "model.layers.6.mlp.gate",
 
165
  "model.layers.6.mlp.shared_expert.gate_proj",
166
  "model.layers.6.mlp.shared_expert.up_proj",
167
  "model.layers.6.mlp.shared_expert.down_proj",
168
  "model.layers.6.mlp.shared_expert_gate",
 
169
  "model.layers.7.self_attn.q_proj",
170
  "model.layers.7.self_attn.k_proj",
171
  "model.layers.7.self_attn.v_proj",
172
  "model.layers.7.self_attn.o_proj",
173
  "model.layers.7.mlp.gate",
 
174
  "model.layers.7.mlp.shared_expert.gate_proj",
175
  "model.layers.7.mlp.shared_expert.up_proj",
176
  "model.layers.7.mlp.shared_expert.down_proj",
@@ -179,6 +187,7 @@
179
  "model.layers.8.linear_attn.in_proj_ba",
180
  "model.layers.8.linear_attn.out_proj",
181
  "model.layers.8.mlp.gate",
 
182
  "model.layers.8.mlp.shared_expert.gate_proj",
183
  "model.layers.8.mlp.shared_expert.up_proj",
184
  "model.layers.8.mlp.shared_expert.down_proj",
@@ -187,6 +196,7 @@
187
  "model.layers.9.linear_attn.in_proj_ba",
188
  "model.layers.9.linear_attn.out_proj",
189
  "model.layers.9.mlp.gate",
 
190
  "model.layers.9.mlp.shared_expert.gate_proj",
191
  "model.layers.9.mlp.shared_expert.up_proj",
192
  "model.layers.9.mlp.shared_expert.down_proj",
@@ -195,15 +205,18 @@
195
  "model.layers.10.linear_attn.in_proj_ba",
196
  "model.layers.10.linear_attn.out_proj",
197
  "model.layers.10.mlp.gate",
 
198
  "model.layers.10.mlp.shared_expert.gate_proj",
199
  "model.layers.10.mlp.shared_expert.up_proj",
200
  "model.layers.10.mlp.shared_expert.down_proj",
201
  "model.layers.10.mlp.shared_expert_gate",
 
202
  "model.layers.11.self_attn.q_proj",
203
  "model.layers.11.self_attn.k_proj",
204
  "model.layers.11.self_attn.v_proj",
205
  "model.layers.11.self_attn.o_proj",
206
  "model.layers.11.mlp.gate",
 
207
  "model.layers.11.mlp.shared_expert.gate_proj",
208
  "model.layers.11.mlp.shared_expert.up_proj",
209
  "model.layers.11.mlp.shared_expert.down_proj",
@@ -212,6 +225,7 @@
212
  "model.layers.12.linear_attn.in_proj_ba",
213
  "model.layers.12.linear_attn.out_proj",
214
  "model.layers.12.mlp.gate",
 
215
  "model.layers.12.mlp.shared_expert.gate_proj",
216
  "model.layers.12.mlp.shared_expert.up_proj",
217
  "model.layers.12.mlp.shared_expert.down_proj",
@@ -220,6 +234,7 @@
220
  "model.layers.13.linear_attn.in_proj_ba",
221
  "model.layers.13.linear_attn.out_proj",
222
  "model.layers.13.mlp.gate",
 
223
  "model.layers.13.mlp.shared_expert.gate_proj",
224
  "model.layers.13.mlp.shared_expert.up_proj",
225
  "model.layers.13.mlp.shared_expert.down_proj",
@@ -228,15 +243,18 @@
228
  "model.layers.14.linear_attn.in_proj_ba",
229
  "model.layers.14.linear_attn.out_proj",
230
  "model.layers.14.mlp.gate",
 
231
  "model.layers.14.mlp.shared_expert.gate_proj",
232
  "model.layers.14.mlp.shared_expert.up_proj",
233
  "model.layers.14.mlp.shared_expert.down_proj",
234
  "model.layers.14.mlp.shared_expert_gate",
 
235
  "model.layers.15.self_attn.q_proj",
236
  "model.layers.15.self_attn.k_proj",
237
  "model.layers.15.self_attn.v_proj",
238
  "model.layers.15.self_attn.o_proj",
239
  "model.layers.15.mlp.gate",
 
240
  "model.layers.15.mlp.shared_expert.gate_proj",
241
  "model.layers.15.mlp.shared_expert.up_proj",
242
  "model.layers.15.mlp.shared_expert.down_proj",
@@ -245,6 +263,7 @@
245
  "model.layers.16.linear_attn.in_proj_ba",
246
  "model.layers.16.linear_attn.out_proj",
247
  "model.layers.16.mlp.gate",
 
248
  "model.layers.16.mlp.shared_expert.gate_proj",
249
  "model.layers.16.mlp.shared_expert.up_proj",
250
  "model.layers.16.mlp.shared_expert.down_proj",
@@ -253,6 +272,7 @@
253
  "model.layers.17.linear_attn.in_proj_ba",
254
  "model.layers.17.linear_attn.out_proj",
255
  "model.layers.17.mlp.gate",
 
256
  "model.layers.17.mlp.shared_expert.gate_proj",
257
  "model.layers.17.mlp.shared_expert.up_proj",
258
  "model.layers.17.mlp.shared_expert.down_proj",
@@ -261,15 +281,18 @@
261
  "model.layers.18.linear_attn.in_proj_ba",
262
  "model.layers.18.linear_attn.out_proj",
263
  "model.layers.18.mlp.gate",
 
264
  "model.layers.18.mlp.shared_expert.gate_proj",
265
  "model.layers.18.mlp.shared_expert.up_proj",
266
  "model.layers.18.mlp.shared_expert.down_proj",
267
  "model.layers.18.mlp.shared_expert_gate",
 
268
  "model.layers.19.self_attn.q_proj",
269
  "model.layers.19.self_attn.k_proj",
270
  "model.layers.19.self_attn.v_proj",
271
  "model.layers.19.self_attn.o_proj",
272
  "model.layers.19.mlp.gate",
 
273
  "model.layers.19.mlp.shared_expert.gate_proj",
274
  "model.layers.19.mlp.shared_expert.up_proj",
275
  "model.layers.19.mlp.shared_expert.down_proj",
@@ -278,6 +301,7 @@
278
  "model.layers.20.linear_attn.in_proj_ba",
279
  "model.layers.20.linear_attn.out_proj",
280
  "model.layers.20.mlp.gate",
 
281
  "model.layers.20.mlp.shared_expert.gate_proj",
282
  "model.layers.20.mlp.shared_expert.up_proj",
283
  "model.layers.20.mlp.shared_expert.down_proj",
@@ -286,6 +310,7 @@
286
  "model.layers.21.linear_attn.in_proj_ba",
287
  "model.layers.21.linear_attn.out_proj",
288
  "model.layers.21.mlp.gate",
 
289
  "model.layers.21.mlp.shared_expert.gate_proj",
290
  "model.layers.21.mlp.shared_expert.up_proj",
291
  "model.layers.21.mlp.shared_expert.down_proj",
@@ -294,15 +319,18 @@
294
  "model.layers.22.linear_attn.in_proj_ba",
295
  "model.layers.22.linear_attn.out_proj",
296
  "model.layers.22.mlp.gate",
 
297
  "model.layers.22.mlp.shared_expert.gate_proj",
298
  "model.layers.22.mlp.shared_expert.up_proj",
299
  "model.layers.22.mlp.shared_expert.down_proj",
300
  "model.layers.22.mlp.shared_expert_gate",
 
301
  "model.layers.23.self_attn.q_proj",
302
  "model.layers.23.self_attn.k_proj",
303
  "model.layers.23.self_attn.v_proj",
304
  "model.layers.23.self_attn.o_proj",
305
  "model.layers.23.mlp.gate",
 
306
  "model.layers.23.mlp.shared_expert.gate_proj",
307
  "model.layers.23.mlp.shared_expert.up_proj",
308
  "model.layers.23.mlp.shared_expert.down_proj",
@@ -311,6 +339,7 @@
311
  "model.layers.24.linear_attn.in_proj_ba",
312
  "model.layers.24.linear_attn.out_proj",
313
  "model.layers.24.mlp.gate",
 
314
  "model.layers.24.mlp.shared_expert.gate_proj",
315
  "model.layers.24.mlp.shared_expert.up_proj",
316
  "model.layers.24.mlp.shared_expert.down_proj",
@@ -319,6 +348,7 @@
319
  "model.layers.25.linear_attn.in_proj_ba",
320
  "model.layers.25.linear_attn.out_proj",
321
  "model.layers.25.mlp.gate",
 
322
  "model.layers.25.mlp.shared_expert.gate_proj",
323
  "model.layers.25.mlp.shared_expert.up_proj",
324
  "model.layers.25.mlp.shared_expert.down_proj",
@@ -327,15 +357,18 @@
327
  "model.layers.26.linear_attn.in_proj_ba",
328
  "model.layers.26.linear_attn.out_proj",
329
  "model.layers.26.mlp.gate",
 
330
  "model.layers.26.mlp.shared_expert.gate_proj",
331
  "model.layers.26.mlp.shared_expert.up_proj",
332
  "model.layers.26.mlp.shared_expert.down_proj",
333
  "model.layers.26.mlp.shared_expert_gate",
 
334
  "model.layers.27.self_attn.q_proj",
335
  "model.layers.27.self_attn.k_proj",
336
  "model.layers.27.self_attn.v_proj",
337
  "model.layers.27.self_attn.o_proj",
338
  "model.layers.27.mlp.gate",
 
339
  "model.layers.27.mlp.shared_expert.gate_proj",
340
  "model.layers.27.mlp.shared_expert.up_proj",
341
  "model.layers.27.mlp.shared_expert.down_proj",
@@ -344,6 +377,7 @@
344
  "model.layers.28.linear_attn.in_proj_ba",
345
  "model.layers.28.linear_attn.out_proj",
346
  "model.layers.28.mlp.gate",
 
347
  "model.layers.28.mlp.shared_expert.gate_proj",
348
  "model.layers.28.mlp.shared_expert.up_proj",
349
  "model.layers.28.mlp.shared_expert.down_proj",
@@ -352,6 +386,7 @@
352
  "model.layers.29.linear_attn.in_proj_ba",
353
  "model.layers.29.linear_attn.out_proj",
354
  "model.layers.29.mlp.gate",
 
355
  "model.layers.29.mlp.shared_expert.gate_proj",
356
  "model.layers.29.mlp.shared_expert.up_proj",
357
  "model.layers.29.mlp.shared_expert.down_proj",
@@ -360,15 +395,18 @@
360
  "model.layers.30.linear_attn.in_proj_ba",
361
  "model.layers.30.linear_attn.out_proj",
362
  "model.layers.30.mlp.gate",
 
363
  "model.layers.30.mlp.shared_expert.gate_proj",
364
  "model.layers.30.mlp.shared_expert.up_proj",
365
  "model.layers.30.mlp.shared_expert.down_proj",
366
  "model.layers.30.mlp.shared_expert_gate",
 
367
  "model.layers.31.self_attn.q_proj",
368
  "model.layers.31.self_attn.k_proj",
369
  "model.layers.31.self_attn.v_proj",
370
  "model.layers.31.self_attn.o_proj",
371
  "model.layers.31.mlp.gate",
 
372
  "model.layers.31.mlp.shared_expert.gate_proj",
373
  "model.layers.31.mlp.shared_expert.up_proj",
374
  "model.layers.31.mlp.shared_expert.down_proj",
@@ -377,6 +415,7 @@
377
  "model.layers.32.linear_attn.in_proj_ba",
378
  "model.layers.32.linear_attn.out_proj",
379
  "model.layers.32.mlp.gate",
 
380
  "model.layers.32.mlp.shared_expert.gate_proj",
381
  "model.layers.32.mlp.shared_expert.up_proj",
382
  "model.layers.32.mlp.shared_expert.down_proj",
@@ -385,6 +424,7 @@
385
  "model.layers.33.linear_attn.in_proj_ba",
386
  "model.layers.33.linear_attn.out_proj",
387
  "model.layers.33.mlp.gate",
 
388
  "model.layers.33.mlp.shared_expert.gate_proj",
389
  "model.layers.33.mlp.shared_expert.up_proj",
390
  "model.layers.33.mlp.shared_expert.down_proj",
@@ -393,15 +433,18 @@
393
  "model.layers.34.linear_attn.in_proj_ba",
394
  "model.layers.34.linear_attn.out_proj",
395
  "model.layers.34.mlp.gate",
 
396
  "model.layers.34.mlp.shared_expert.gate_proj",
397
  "model.layers.34.mlp.shared_expert.up_proj",
398
  "model.layers.34.mlp.shared_expert.down_proj",
399
  "model.layers.34.mlp.shared_expert_gate",
 
400
  "model.layers.35.self_attn.q_proj",
401
  "model.layers.35.self_attn.k_proj",
402
  "model.layers.35.self_attn.v_proj",
403
  "model.layers.35.self_attn.o_proj",
404
  "model.layers.35.mlp.gate",
 
405
  "model.layers.35.mlp.shared_expert.gate_proj",
406
  "model.layers.35.mlp.shared_expert.up_proj",
407
  "model.layers.35.mlp.shared_expert.down_proj",
@@ -410,6 +453,7 @@
410
  "model.layers.36.linear_attn.in_proj_ba",
411
  "model.layers.36.linear_attn.out_proj",
412
  "model.layers.36.mlp.gate",
 
413
  "model.layers.36.mlp.shared_expert.gate_proj",
414
  "model.layers.36.mlp.shared_expert.up_proj",
415
  "model.layers.36.mlp.shared_expert.down_proj",
@@ -418,6 +462,7 @@
418
  "model.layers.37.linear_attn.in_proj_ba",
419
  "model.layers.37.linear_attn.out_proj",
420
  "model.layers.37.mlp.gate",
 
421
  "model.layers.37.mlp.shared_expert.gate_proj",
422
  "model.layers.37.mlp.shared_expert.up_proj",
423
  "model.layers.37.mlp.shared_expert.down_proj",
@@ -426,15 +471,18 @@
426
  "model.layers.38.linear_attn.in_proj_ba",
427
  "model.layers.38.linear_attn.out_proj",
428
  "model.layers.38.mlp.gate",
 
429
  "model.layers.38.mlp.shared_expert.gate_proj",
430
  "model.layers.38.mlp.shared_expert.up_proj",
431
  "model.layers.38.mlp.shared_expert.down_proj",
432
  "model.layers.38.mlp.shared_expert_gate",
 
433
  "model.layers.39.self_attn.q_proj",
434
  "model.layers.39.self_attn.k_proj",
435
  "model.layers.39.self_attn.v_proj",
436
  "model.layers.39.self_attn.o_proj",
437
  "model.layers.39.mlp.gate",
 
438
  "model.layers.39.mlp.shared_expert.gate_proj",
439
  "model.layers.39.mlp.shared_expert.up_proj",
440
  "model.layers.39.mlp.shared_expert.down_proj",
@@ -443,6 +491,7 @@
443
  "model.layers.40.linear_attn.in_proj_ba",
444
  "model.layers.40.linear_attn.out_proj",
445
  "model.layers.40.mlp.gate",
 
446
  "model.layers.40.mlp.shared_expert.gate_proj",
447
  "model.layers.40.mlp.shared_expert.up_proj",
448
  "model.layers.40.mlp.shared_expert.down_proj",
@@ -451,6 +500,7 @@
451
  "model.layers.41.linear_attn.in_proj_ba",
452
  "model.layers.41.linear_attn.out_proj",
453
  "model.layers.41.mlp.gate",
 
454
  "model.layers.41.mlp.shared_expert.gate_proj",
455
  "model.layers.41.mlp.shared_expert.up_proj",
456
  "model.layers.41.mlp.shared_expert.down_proj",
@@ -459,15 +509,18 @@
459
  "model.layers.42.linear_attn.in_proj_ba",
460
  "model.layers.42.linear_attn.out_proj",
461
  "model.layers.42.mlp.gate",
 
462
  "model.layers.42.mlp.shared_expert.gate_proj",
463
  "model.layers.42.mlp.shared_expert.up_proj",
464
  "model.layers.42.mlp.shared_expert.down_proj",
465
  "model.layers.42.mlp.shared_expert_gate",
 
466
  "model.layers.43.self_attn.q_proj",
467
  "model.layers.43.self_attn.k_proj",
468
  "model.layers.43.self_attn.v_proj",
469
  "model.layers.43.self_attn.o_proj",
470
  "model.layers.43.mlp.gate",
 
471
  "model.layers.43.mlp.shared_expert.gate_proj",
472
  "model.layers.43.mlp.shared_expert.up_proj",
473
  "model.layers.43.mlp.shared_expert.down_proj",
@@ -476,6 +529,7 @@
476
  "model.layers.44.linear_attn.in_proj_ba",
477
  "model.layers.44.linear_attn.out_proj",
478
  "model.layers.44.mlp.gate",
 
479
  "model.layers.44.mlp.shared_expert.gate_proj",
480
  "model.layers.44.mlp.shared_expert.up_proj",
481
  "model.layers.44.mlp.shared_expert.down_proj",
@@ -484,6 +538,7 @@
484
  "model.layers.45.linear_attn.in_proj_ba",
485
  "model.layers.45.linear_attn.out_proj",
486
  "model.layers.45.mlp.gate",
 
487
  "model.layers.45.mlp.shared_expert.gate_proj",
488
  "model.layers.45.mlp.shared_expert.up_proj",
489
  "model.layers.45.mlp.shared_expert.down_proj",
@@ -492,15 +547,18 @@
492
  "model.layers.46.linear_attn.in_proj_ba",
493
  "model.layers.46.linear_attn.out_proj",
494
  "model.layers.46.mlp.gate",
 
495
  "model.layers.46.mlp.shared_expert.gate_proj",
496
  "model.layers.46.mlp.shared_expert.up_proj",
497
  "model.layers.46.mlp.shared_expert.down_proj",
498
  "model.layers.46.mlp.shared_expert_gate",
 
499
  "model.layers.47.self_attn.q_proj",
500
  "model.layers.47.self_attn.k_proj",
501
  "model.layers.47.self_attn.v_proj",
502
  "model.layers.47.self_attn.o_proj",
503
  "model.layers.47.mlp.gate",
 
504
  "model.layers.47.mlp.shared_expert.gate_proj",
505
  "model.layers.47.mlp.shared_expert.up_proj",
506
  "model.layers.47.mlp.shared_expert.down_proj",
@@ -512,7 +570,7 @@
512
  "quantization_status": "compressed",
513
  "sparsity_config": {},
514
  "transform_config": {},
515
- "version": "0.13.1.a20260217"
516
  },
517
  "rms_norm_eps": 1e-06,
518
  "rope_scaling": null,
 
98
  "num_bits": 8,
99
  "observer": "mse",
100
  "observer_kwargs": {},
 
101
  "strategy": "group",
102
  "symmetric": true,
103
+ "type": "int"
 
104
  }
105
  }
106
  },
 
111
  "model.layers.0.linear_attn.in_proj_ba",
112
  "model.layers.0.linear_attn.out_proj",
113
  "model.layers.0.mlp.gate",
114
+ "model.layers.0.mlp.shared_expert.gate_up_proj",
115
  "model.layers.0.mlp.shared_expert.gate_proj",
116
  "model.layers.0.mlp.shared_expert.up_proj",
117
  "model.layers.0.mlp.shared_expert.down_proj",
 
120
  "model.layers.1.linear_attn.in_proj_ba",
121
  "model.layers.1.linear_attn.out_proj",
122
  "model.layers.1.mlp.gate",
123
+ "model.layers.1.mlp.shared_expert.gate_up_proj",
124
  "model.layers.1.mlp.shared_expert.gate_proj",
125
  "model.layers.1.mlp.shared_expert.up_proj",
126
  "model.layers.1.mlp.shared_expert.down_proj",
 
129
  "model.layers.2.linear_attn.in_proj_ba",
130
  "model.layers.2.linear_attn.out_proj",
131
  "model.layers.2.mlp.gate",
132
+ "model.layers.2.mlp.shared_expert.gate_up_proj",
133
  "model.layers.2.mlp.shared_expert.gate_proj",
134
  "model.layers.2.mlp.shared_expert.up_proj",
135
  "model.layers.2.mlp.shared_expert.down_proj",
136
  "model.layers.2.mlp.shared_expert_gate",
137
+ "model.layers.3.self_attn.qkv_proj",
138
  "model.layers.3.self_attn.q_proj",
139
  "model.layers.3.self_attn.k_proj",
140
  "model.layers.3.self_attn.v_proj",
141
  "model.layers.3.self_attn.o_proj",
142
  "model.layers.3.mlp.gate",
143
+ "model.layers.3.mlp.shared_expert.gate_up_proj",
144
  "model.layers.3.mlp.shared_expert.gate_proj",
145
  "model.layers.3.mlp.shared_expert.up_proj",
146
  "model.layers.3.mlp.shared_expert.down_proj",
 
149
  "model.layers.4.linear_attn.in_proj_ba",
150
  "model.layers.4.linear_attn.out_proj",
151
  "model.layers.4.mlp.gate",
152
+ "model.layers.4.mlp.shared_expert.gate_up_proj",
153
  "model.layers.4.mlp.shared_expert.gate_proj",
154
  "model.layers.4.mlp.shared_expert.up_proj",
155
  "model.layers.4.mlp.shared_expert.down_proj",
 
158
  "model.layers.5.linear_attn.in_proj_ba",
159
  "model.layers.5.linear_attn.out_proj",
160
  "model.layers.5.mlp.gate",
161
+ "model.layers.5.mlp.shared_expert.gate_up_proj",
162
  "model.layers.5.mlp.shared_expert.gate_proj",
163
  "model.layers.5.mlp.shared_expert.up_proj",
164
  "model.layers.5.mlp.shared_expert.down_proj",
 
167
  "model.layers.6.linear_attn.in_proj_ba",
168
  "model.layers.6.linear_attn.out_proj",
169
  "model.layers.6.mlp.gate",
170
+ "model.layers.6.mlp.shared_expert.gate_up_proj",
171
  "model.layers.6.mlp.shared_expert.gate_proj",
172
  "model.layers.6.mlp.shared_expert.up_proj",
173
  "model.layers.6.mlp.shared_expert.down_proj",
174
  "model.layers.6.mlp.shared_expert_gate",
175
+ "model.layers.7.self_attn.qkv_proj",
176
  "model.layers.7.self_attn.q_proj",
177
  "model.layers.7.self_attn.k_proj",
178
  "model.layers.7.self_attn.v_proj",
179
  "model.layers.7.self_attn.o_proj",
180
  "model.layers.7.mlp.gate",
181
+ "model.layers.7.mlp.shared_expert.gate_up_proj",
182
  "model.layers.7.mlp.shared_expert.gate_proj",
183
  "model.layers.7.mlp.shared_expert.up_proj",
184
  "model.layers.7.mlp.shared_expert.down_proj",
 
187
  "model.layers.8.linear_attn.in_proj_ba",
188
  "model.layers.8.linear_attn.out_proj",
189
  "model.layers.8.mlp.gate",
190
+ "model.layers.8.mlp.shared_expert.gate_up_proj",
191
  "model.layers.8.mlp.shared_expert.gate_proj",
192
  "model.layers.8.mlp.shared_expert.up_proj",
193
  "model.layers.8.mlp.shared_expert.down_proj",
 
196
  "model.layers.9.linear_attn.in_proj_ba",
197
  "model.layers.9.linear_attn.out_proj",
198
  "model.layers.9.mlp.gate",
199
+ "model.layers.9.mlp.shared_expert.gate_up_proj",
200
  "model.layers.9.mlp.shared_expert.gate_proj",
201
  "model.layers.9.mlp.shared_expert.up_proj",
202
  "model.layers.9.mlp.shared_expert.down_proj",
 
205
  "model.layers.10.linear_attn.in_proj_ba",
206
  "model.layers.10.linear_attn.out_proj",
207
  "model.layers.10.mlp.gate",
208
+ "model.layers.10.mlp.shared_expert.gate_up_proj",
209
  "model.layers.10.mlp.shared_expert.gate_proj",
210
  "model.layers.10.mlp.shared_expert.up_proj",
211
  "model.layers.10.mlp.shared_expert.down_proj",
212
  "model.layers.10.mlp.shared_expert_gate",
213
+ "model.layers.11.self_attn.qkv_proj",
214
  "model.layers.11.self_attn.q_proj",
215
  "model.layers.11.self_attn.k_proj",
216
  "model.layers.11.self_attn.v_proj",
217
  "model.layers.11.self_attn.o_proj",
218
  "model.layers.11.mlp.gate",
219
+ "model.layers.11.mlp.shared_expert.gate_up_proj",
220
  "model.layers.11.mlp.shared_expert.gate_proj",
221
  "model.layers.11.mlp.shared_expert.up_proj",
222
  "model.layers.11.mlp.shared_expert.down_proj",
 
225
  "model.layers.12.linear_attn.in_proj_ba",
226
  "model.layers.12.linear_attn.out_proj",
227
  "model.layers.12.mlp.gate",
228
+ "model.layers.12.mlp.shared_expert.gate_up_proj",
229
  "model.layers.12.mlp.shared_expert.gate_proj",
230
  "model.layers.12.mlp.shared_expert.up_proj",
231
  "model.layers.12.mlp.shared_expert.down_proj",
 
234
  "model.layers.13.linear_attn.in_proj_ba",
235
  "model.layers.13.linear_attn.out_proj",
236
  "model.layers.13.mlp.gate",
237
+ "model.layers.13.mlp.shared_expert.gate_up_proj",
238
  "model.layers.13.mlp.shared_expert.gate_proj",
239
  "model.layers.13.mlp.shared_expert.up_proj",
240
  "model.layers.13.mlp.shared_expert.down_proj",
 
243
  "model.layers.14.linear_attn.in_proj_ba",
244
  "model.layers.14.linear_attn.out_proj",
245
  "model.layers.14.mlp.gate",
246
+ "model.layers.14.mlp.shared_expert.gate_up_proj",
247
  "model.layers.14.mlp.shared_expert.gate_proj",
248
  "model.layers.14.mlp.shared_expert.up_proj",
249
  "model.layers.14.mlp.shared_expert.down_proj",
250
  "model.layers.14.mlp.shared_expert_gate",
251
+ "model.layers.15.self_attn.qkv_proj",
252
  "model.layers.15.self_attn.q_proj",
253
  "model.layers.15.self_attn.k_proj",
254
  "model.layers.15.self_attn.v_proj",
255
  "model.layers.15.self_attn.o_proj",
256
  "model.layers.15.mlp.gate",
257
+ "model.layers.15.mlp.shared_expert.gate_up_proj",
258
  "model.layers.15.mlp.shared_expert.gate_proj",
259
  "model.layers.15.mlp.shared_expert.up_proj",
260
  "model.layers.15.mlp.shared_expert.down_proj",
 
263
  "model.layers.16.linear_attn.in_proj_ba",
264
  "model.layers.16.linear_attn.out_proj",
265
  "model.layers.16.mlp.gate",
266
+ "model.layers.16.mlp.shared_expert.gate_up_proj",
267
  "model.layers.16.mlp.shared_expert.gate_proj",
268
  "model.layers.16.mlp.shared_expert.up_proj",
269
  "model.layers.16.mlp.shared_expert.down_proj",
 
272
  "model.layers.17.linear_attn.in_proj_ba",
273
  "model.layers.17.linear_attn.out_proj",
274
  "model.layers.17.mlp.gate",
275
+ "model.layers.17.mlp.shared_expert.gate_up_proj",
276
  "model.layers.17.mlp.shared_expert.gate_proj",
277
  "model.layers.17.mlp.shared_expert.up_proj",
278
  "model.layers.17.mlp.shared_expert.down_proj",
 
281
  "model.layers.18.linear_attn.in_proj_ba",
282
  "model.layers.18.linear_attn.out_proj",
283
  "model.layers.18.mlp.gate",
284
+ "model.layers.18.mlp.shared_expert.gate_up_proj",
285
  "model.layers.18.mlp.shared_expert.gate_proj",
286
  "model.layers.18.mlp.shared_expert.up_proj",
287
  "model.layers.18.mlp.shared_expert.down_proj",
288
  "model.layers.18.mlp.shared_expert_gate",
289
+ "model.layers.19.self_attn.qkv_proj",
290
  "model.layers.19.self_attn.q_proj",
291
  "model.layers.19.self_attn.k_proj",
292
  "model.layers.19.self_attn.v_proj",
293
  "model.layers.19.self_attn.o_proj",
294
  "model.layers.19.mlp.gate",
295
+ "model.layers.19.mlp.shared_expert.gate_up_proj",
296
  "model.layers.19.mlp.shared_expert.gate_proj",
297
  "model.layers.19.mlp.shared_expert.up_proj",
298
  "model.layers.19.mlp.shared_expert.down_proj",
 
301
  "model.layers.20.linear_attn.in_proj_ba",
302
  "model.layers.20.linear_attn.out_proj",
303
  "model.layers.20.mlp.gate",
304
+ "model.layers.20.mlp.shared_expert.gate_up_proj",
305
  "model.layers.20.mlp.shared_expert.gate_proj",
306
  "model.layers.20.mlp.shared_expert.up_proj",
307
  "model.layers.20.mlp.shared_expert.down_proj",
 
310
  "model.layers.21.linear_attn.in_proj_ba",
311
  "model.layers.21.linear_attn.out_proj",
312
  "model.layers.21.mlp.gate",
313
+ "model.layers.21.mlp.shared_expert.gate_up_proj",
314
  "model.layers.21.mlp.shared_expert.gate_proj",
315
  "model.layers.21.mlp.shared_expert.up_proj",
316
  "model.layers.21.mlp.shared_expert.down_proj",
 
319
  "model.layers.22.linear_attn.in_proj_ba",
320
  "model.layers.22.linear_attn.out_proj",
321
  "model.layers.22.mlp.gate",
322
+ "model.layers.22.mlp.shared_expert.gate_up_proj",
323
  "model.layers.22.mlp.shared_expert.gate_proj",
324
  "model.layers.22.mlp.shared_expert.up_proj",
325
  "model.layers.22.mlp.shared_expert.down_proj",
326
  "model.layers.22.mlp.shared_expert_gate",
327
+ "model.layers.23.self_attn.qkv_proj",
328
  "model.layers.23.self_attn.q_proj",
329
  "model.layers.23.self_attn.k_proj",
330
  "model.layers.23.self_attn.v_proj",
331
  "model.layers.23.self_attn.o_proj",
332
  "model.layers.23.mlp.gate",
333
+ "model.layers.23.mlp.shared_expert.gate_up_proj",
334
  "model.layers.23.mlp.shared_expert.gate_proj",
335
  "model.layers.23.mlp.shared_expert.up_proj",
336
  "model.layers.23.mlp.shared_expert.down_proj",
 
339
  "model.layers.24.linear_attn.in_proj_ba",
340
  "model.layers.24.linear_attn.out_proj",
341
  "model.layers.24.mlp.gate",
342
+ "model.layers.24.mlp.shared_expert.gate_up_proj",
343
  "model.layers.24.mlp.shared_expert.gate_proj",
344
  "model.layers.24.mlp.shared_expert.up_proj",
345
  "model.layers.24.mlp.shared_expert.down_proj",
 
348
  "model.layers.25.linear_attn.in_proj_ba",
349
  "model.layers.25.linear_attn.out_proj",
350
  "model.layers.25.mlp.gate",
351
+ "model.layers.25.mlp.shared_expert.gate_up_proj",
352
  "model.layers.25.mlp.shared_expert.gate_proj",
353
  "model.layers.25.mlp.shared_expert.up_proj",
354
  "model.layers.25.mlp.shared_expert.down_proj",
 
357
  "model.layers.26.linear_attn.in_proj_ba",
358
  "model.layers.26.linear_attn.out_proj",
359
  "model.layers.26.mlp.gate",
360
+ "model.layers.26.mlp.shared_expert.gate_up_proj",
361
  "model.layers.26.mlp.shared_expert.gate_proj",
362
  "model.layers.26.mlp.shared_expert.up_proj",
363
  "model.layers.26.mlp.shared_expert.down_proj",
364
  "model.layers.26.mlp.shared_expert_gate",
365
+ "model.layers.27.self_attn.qkv_proj",
366
  "model.layers.27.self_attn.q_proj",
367
  "model.layers.27.self_attn.k_proj",
368
  "model.layers.27.self_attn.v_proj",
369
  "model.layers.27.self_attn.o_proj",
370
  "model.layers.27.mlp.gate",
371
+ "model.layers.27.mlp.shared_expert.gate_up_proj",
372
  "model.layers.27.mlp.shared_expert.gate_proj",
373
  "model.layers.27.mlp.shared_expert.up_proj",
374
  "model.layers.27.mlp.shared_expert.down_proj",
 
377
  "model.layers.28.linear_attn.in_proj_ba",
378
  "model.layers.28.linear_attn.out_proj",
379
  "model.layers.28.mlp.gate",
380
+ "model.layers.28.mlp.shared_expert.gate_up_proj",
381
  "model.layers.28.mlp.shared_expert.gate_proj",
382
  "model.layers.28.mlp.shared_expert.up_proj",
383
  "model.layers.28.mlp.shared_expert.down_proj",
 
386
  "model.layers.29.linear_attn.in_proj_ba",
387
  "model.layers.29.linear_attn.out_proj",
388
  "model.layers.29.mlp.gate",
389
+ "model.layers.29.mlp.shared_expert.gate_up_proj",
390
  "model.layers.29.mlp.shared_expert.gate_proj",
391
  "model.layers.29.mlp.shared_expert.up_proj",
392
  "model.layers.29.mlp.shared_expert.down_proj",
 
395
  "model.layers.30.linear_attn.in_proj_ba",
396
  "model.layers.30.linear_attn.out_proj",
397
  "model.layers.30.mlp.gate",
398
+ "model.layers.30.mlp.shared_expert.gate_up_proj",
399
  "model.layers.30.mlp.shared_expert.gate_proj",
400
  "model.layers.30.mlp.shared_expert.up_proj",
401
  "model.layers.30.mlp.shared_expert.down_proj",
402
  "model.layers.30.mlp.shared_expert_gate",
403
+ "model.layers.31.self_attn.qkv_proj",
404
  "model.layers.31.self_attn.q_proj",
405
  "model.layers.31.self_attn.k_proj",
406
  "model.layers.31.self_attn.v_proj",
407
  "model.layers.31.self_attn.o_proj",
408
  "model.layers.31.mlp.gate",
409
+ "model.layers.31.mlp.shared_expert.gate_up_proj",
410
  "model.layers.31.mlp.shared_expert.gate_proj",
411
  "model.layers.31.mlp.shared_expert.up_proj",
412
  "model.layers.31.mlp.shared_expert.down_proj",
 
415
  "model.layers.32.linear_attn.in_proj_ba",
416
  "model.layers.32.linear_attn.out_proj",
417
  "model.layers.32.mlp.gate",
418
+ "model.layers.32.mlp.shared_expert.gate_up_proj",
419
  "model.layers.32.mlp.shared_expert.gate_proj",
420
  "model.layers.32.mlp.shared_expert.up_proj",
421
  "model.layers.32.mlp.shared_expert.down_proj",
 
424
  "model.layers.33.linear_attn.in_proj_ba",
425
  "model.layers.33.linear_attn.out_proj",
426
  "model.layers.33.mlp.gate",
427
+ "model.layers.33.mlp.shared_expert.gate_up_proj",
428
  "model.layers.33.mlp.shared_expert.gate_proj",
429
  "model.layers.33.mlp.shared_expert.up_proj",
430
  "model.layers.33.mlp.shared_expert.down_proj",
 
433
  "model.layers.34.linear_attn.in_proj_ba",
434
  "model.layers.34.linear_attn.out_proj",
435
  "model.layers.34.mlp.gate",
436
+ "model.layers.34.mlp.shared_expert.gate_up_proj",
437
  "model.layers.34.mlp.shared_expert.gate_proj",
438
  "model.layers.34.mlp.shared_expert.up_proj",
439
  "model.layers.34.mlp.shared_expert.down_proj",
440
  "model.layers.34.mlp.shared_expert_gate",
441
+ "model.layers.35.self_attn.qkv_proj",
442
  "model.layers.35.self_attn.q_proj",
443
  "model.layers.35.self_attn.k_proj",
444
  "model.layers.35.self_attn.v_proj",
445
  "model.layers.35.self_attn.o_proj",
446
  "model.layers.35.mlp.gate",
447
+ "model.layers.35.mlp.shared_expert.gate_up_proj",
448
  "model.layers.35.mlp.shared_expert.gate_proj",
449
  "model.layers.35.mlp.shared_expert.up_proj",
450
  "model.layers.35.mlp.shared_expert.down_proj",
 
453
  "model.layers.36.linear_attn.in_proj_ba",
454
  "model.layers.36.linear_attn.out_proj",
455
  "model.layers.36.mlp.gate",
456
+ "model.layers.36.mlp.shared_expert.gate_up_proj",
457
  "model.layers.36.mlp.shared_expert.gate_proj",
458
  "model.layers.36.mlp.shared_expert.up_proj",
459
  "model.layers.36.mlp.shared_expert.down_proj",
 
462
  "model.layers.37.linear_attn.in_proj_ba",
463
  "model.layers.37.linear_attn.out_proj",
464
  "model.layers.37.mlp.gate",
465
+ "model.layers.37.mlp.shared_expert.gate_up_proj",
466
  "model.layers.37.mlp.shared_expert.gate_proj",
467
  "model.layers.37.mlp.shared_expert.up_proj",
468
  "model.layers.37.mlp.shared_expert.down_proj",
 
471
  "model.layers.38.linear_attn.in_proj_ba",
472
  "model.layers.38.linear_attn.out_proj",
473
  "model.layers.38.mlp.gate",
474
+ "model.layers.38.mlp.shared_expert.gate_up_proj",
475
  "model.layers.38.mlp.shared_expert.gate_proj",
476
  "model.layers.38.mlp.shared_expert.up_proj",
477
  "model.layers.38.mlp.shared_expert.down_proj",
478
  "model.layers.38.mlp.shared_expert_gate",
479
+ "model.layers.39.self_attn.qkv_proj",
480
  "model.layers.39.self_attn.q_proj",
481
  "model.layers.39.self_attn.k_proj",
482
  "model.layers.39.self_attn.v_proj",
483
  "model.layers.39.self_attn.o_proj",
484
  "model.layers.39.mlp.gate",
485
+ "model.layers.39.mlp.shared_expert.gate_up_proj",
486
  "model.layers.39.mlp.shared_expert.gate_proj",
487
  "model.layers.39.mlp.shared_expert.up_proj",
488
  "model.layers.39.mlp.shared_expert.down_proj",
 
491
  "model.layers.40.linear_attn.in_proj_ba",
492
  "model.layers.40.linear_attn.out_proj",
493
  "model.layers.40.mlp.gate",
494
+ "model.layers.40.mlp.shared_expert.gate_up_proj",
495
  "model.layers.40.mlp.shared_expert.gate_proj",
496
  "model.layers.40.mlp.shared_expert.up_proj",
497
  "model.layers.40.mlp.shared_expert.down_proj",
 
500
  "model.layers.41.linear_attn.in_proj_ba",
501
  "model.layers.41.linear_attn.out_proj",
502
  "model.layers.41.mlp.gate",
503
+ "model.layers.41.mlp.shared_expert.gate_up_proj",
504
  "model.layers.41.mlp.shared_expert.gate_proj",
505
  "model.layers.41.mlp.shared_expert.up_proj",
506
  "model.layers.41.mlp.shared_expert.down_proj",
 
509
  "model.layers.42.linear_attn.in_proj_ba",
510
  "model.layers.42.linear_attn.out_proj",
511
  "model.layers.42.mlp.gate",
512
+ "model.layers.42.mlp.shared_expert.gate_up_proj",
513
  "model.layers.42.mlp.shared_expert.gate_proj",
514
  "model.layers.42.mlp.shared_expert.up_proj",
515
  "model.layers.42.mlp.shared_expert.down_proj",
516
  "model.layers.42.mlp.shared_expert_gate",
517
+ "model.layers.43.self_attn.qkv_proj",
518
  "model.layers.43.self_attn.q_proj",
519
  "model.layers.43.self_attn.k_proj",
520
  "model.layers.43.self_attn.v_proj",
521
  "model.layers.43.self_attn.o_proj",
522
  "model.layers.43.mlp.gate",
523
+ "model.layers.43.mlp.shared_expert.gate_up_proj",
524
  "model.layers.43.mlp.shared_expert.gate_proj",
525
  "model.layers.43.mlp.shared_expert.up_proj",
526
  "model.layers.43.mlp.shared_expert.down_proj",
 
529
  "model.layers.44.linear_attn.in_proj_ba",
530
  "model.layers.44.linear_attn.out_proj",
531
  "model.layers.44.mlp.gate",
532
+ "model.layers.44.mlp.shared_expert.gate_up_proj",
533
  "model.layers.44.mlp.shared_expert.gate_proj",
534
  "model.layers.44.mlp.shared_expert.up_proj",
535
  "model.layers.44.mlp.shared_expert.down_proj",
 
538
  "model.layers.45.linear_attn.in_proj_ba",
539
  "model.layers.45.linear_attn.out_proj",
540
  "model.layers.45.mlp.gate",
541
+ "model.layers.45.mlp.shared_expert.gate_up_proj",
542
  "model.layers.45.mlp.shared_expert.gate_proj",
543
  "model.layers.45.mlp.shared_expert.up_proj",
544
  "model.layers.45.mlp.shared_expert.down_proj",
 
547
  "model.layers.46.linear_attn.in_proj_ba",
548
  "model.layers.46.linear_attn.out_proj",
549
  "model.layers.46.mlp.gate",
550
+ "model.layers.46.mlp.shared_expert.gate_up_proj",
551
  "model.layers.46.mlp.shared_expert.gate_proj",
552
  "model.layers.46.mlp.shared_expert.up_proj",
553
  "model.layers.46.mlp.shared_expert.down_proj",
554
  "model.layers.46.mlp.shared_expert_gate",
555
+ "model.layers.47.self_attn.qkv_proj",
556
  "model.layers.47.self_attn.q_proj",
557
  "model.layers.47.self_attn.k_proj",
558
  "model.layers.47.self_attn.v_proj",
559
  "model.layers.47.self_attn.o_proj",
560
  "model.layers.47.mlp.gate",
561
+ "model.layers.47.mlp.shared_expert.gate_up_proj",
562
  "model.layers.47.mlp.shared_expert.gate_proj",
563
  "model.layers.47.mlp.shared_expert.up_proj",
564
  "model.layers.47.mlp.shared_expert.down_proj",
 
570
  "quantization_status": "compressed",
571
  "sparsity_config": {},
572
  "transform_config": {},
573
+ "version": "0.12.3.a20251114"
574
  },
575
  "rms_norm_eps": 1e-06,
576
  "rope_scaling": null,