File size: 23,020 Bytes
98b9392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
{
  "cycle": 6,
  "layer_merged": 21,
  "num_layers_before": 27,
  "num_layers_after": 26,
  "fused_layer_state": "fused_layer.pt",
  "dwce_score": 0.060528135887943944,
  "dwce_scores": [
    null,
    null,
    0.10885138210341537,
    0.09962797171315212,
    0.09704690759872063,
    0.10032807292174113,
    0.07477073061165104,
    0.06816870958942334,
    0.08890422275158102,
    0.1163330007130214,
    0.14856900172908535,
    0.15104770110436222,
    0.15976559604817797,
    0.16747390251227062,
    0.17331518344021776,
    0.14107504898742165,
    0.12868654620236872,
    0.13613157978454246,
    0.1218191001657377,
    0.10902217379118222,
    0.08936219242296228,
    0.060528135887943944,
    0.07539605212866492,
    0.09116655794274726,
    0.11185780392379165,
    null
  ],
  "dwce_meta": {
    "per_pair": [
      {
        "excluded": true
      },
      {
        "excluded": true
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.45602849053786343,
          "self_attn.k_proj.weight": 0.393914402513139,
          "self_attn.v_proj.weight": 0.540396339894555,
          "self_attn.o_proj.weight": 0.4469669933138352,
          "mlp.gate_proj.weight": 0.3287149601078028,
          "mlp.up_proj.weight": 0.35017769317939573,
          "mlp.down_proj.weight": 0.396783937778306,
          "input_layernorm.weight": 0.17714660120924552,
          "post_attention_layernorm.weight": 0.5706843420160465
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2956975268685422,
          "self_attn.k_proj.weight": 0.3772577526025228,
          "self_attn.v_proj.weight": 0.5824193735130013,
          "self_attn.o_proj.weight": 0.44336875810997856,
          "mlp.gate_proj.weight": 0.30769135298804196,
          "mlp.up_proj.weight": 0.35382330315519916,
          "mlp.down_proj.weight": 0.37185043779817395,
          "input_layernorm.weight": 0.7024250477029108,
          "post_attention_layernorm.weight": 0.5121384967983699
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30319082595336366,
          "self_attn.k_proj.weight": 0.3492474196377044,
          "self_attn.v_proj.weight": 0.569890311972998,
          "self_attn.o_proj.weight": 0.40464328058154025,
          "mlp.gate_proj.weight": 0.28532597693591105,
          "mlp.up_proj.weight": 0.37282024323447943,
          "mlp.down_proj.weight": 0.3841916217498055,
          "input_layernorm.weight": 0.4863317106902496,
          "post_attention_layernorm.weight": 0.57501372396618
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30239571422605777,
          "self_attn.k_proj.weight": 0.28777982633775234,
          "self_attn.v_proj.weight": 0.5214361921442343,
          "self_attn.o_proj.weight": 0.43105923012955555,
          "mlp.gate_proj.weight": 0.2730870548532783,
          "mlp.up_proj.weight": 0.3623753277725939,
          "mlp.down_proj.weight": 0.38708954857683764,
          "input_layernorm.weight": 0.4643158587476409,
          "post_attention_layernorm.weight": 0.4712457550840951
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.34829974358469984,
          "self_attn.k_proj.weight": 0.3124058890492968,
          "self_attn.v_proj.weight": 0.44195795428146895,
          "self_attn.o_proj.weight": 0.4143407830271052,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.5509473359135564,
          "mlp.down_proj.weight": 0.5172651880216964,
          "input_layernorm.weight": 0.6178273084347614,
          "post_attention_layernorm.weight": 0.34427816773879094
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2730859710153346,
          "self_attn.k_proj.weight": 0.3061184297688129,
          "self_attn.v_proj.weight": 0.533020418936679,
          "self_attn.o_proj.weight": 0.4119345004856529,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.13066309879562818,
          "mlp.down_proj.weight": 0.2330601154961996,
          "input_layernorm.weight": 0.8059542094784746,
          "post_attention_layernorm.weight": 0.6625755866545766
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.48768939039675996,
          "self_attn.k_proj.weight": 0.4777739962444542,
          "self_attn.v_proj.weight": 0.5103851366152496,
          "self_attn.o_proj.weight": 0.43250945492739656,
          "mlp.gate_proj.weight": 0.3257707165607638,
          "mlp.up_proj.weight": 0.4096843334038478,
          "mlp.down_proj.weight": 0.34885861400447404,
          "input_layernorm.weight": 0.4200429536840167,
          "post_attention_layernorm.weight": 0.5367871634336786
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.29755787633442426,
          "self_attn.k_proj.weight": 0.37623755403743336,
          "self_attn.v_proj.weight": 0.6136683116996197,
          "self_attn.o_proj.weight": 0.5741847831544276,
          "mlp.gate_proj.weight": 0.27139084405979624,
          "mlp.up_proj.weight": 0.352502726534821,
          "mlp.down_proj.weight": 0.3594937163598074,
          "input_layernorm.weight": 0.5244406580066927,
          "post_attention_layernorm.weight": 0.5383591174280284
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.48784027196921265,
          "self_attn.k_proj.weight": 0.39846873275053135,
          "self_attn.v_proj.weight": 0.4618083281504365,
          "self_attn.o_proj.weight": 0.5396497221340433,
          "mlp.gate_proj.weight": 0.28731097517799203,
          "mlp.up_proj.weight": 0.37777335079811836,
          "mlp.down_proj.weight": 0.38422114872976587,
          "input_layernorm.weight": 0.5041621307381597,
          "post_attention_layernorm.weight": 0.5670639954113798
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.23827875940660836,
          "self_attn.k_proj.weight": 0.3238894105610408,
          "self_attn.v_proj.weight": 0.5690090563700821,
          "self_attn.o_proj.weight": 0.5450013202392769,
          "mlp.gate_proj.weight": 0.27271401584431,
          "mlp.up_proj.weight": 0.3708806850281214,
          "mlp.down_proj.weight": 0.39500743275208433,
          "input_layernorm.weight": 0.6987078362557021,
          "post_attention_layernorm.weight": 0.5374434799165528
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3946032688367209,
          "self_attn.k_proj.weight": 0.40690493743203565,
          "self_attn.v_proj.weight": 0.5164469910119862,
          "self_attn.o_proj.weight": 0.4597598293804492,
          "mlp.gate_proj.weight": 0.269573396995867,
          "mlp.up_proj.weight": 0.35366279000533957,
          "mlp.down_proj.weight": 0.3877383926763193,
          "input_layernorm.weight": 0.5460628958641058,
          "post_attention_layernorm.weight": 0.5071179303686918
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.3008175631152415,
          "self_attn.v_proj.weight": 0.5493743908332103,
          "self_attn.o_proj.weight": 0.4661606538187534,
          "mlp.gate_proj.weight": 0.2783245620057637,
          "mlp.up_proj.weight": 0.36798475292532096,
          "mlp.down_proj.weight": 0.41990780468185046,
          "input_layernorm.weight": 0.3526699737302493,
          "post_attention_layernorm.weight": 0.5026090703676459
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5046790280321507,
          "self_attn.o_proj.weight": 0.47862372917098306,
          "mlp.gate_proj.weight": 0.28429170701639556,
          "mlp.up_proj.weight": 0.3568216404807901,
          "mlp.down_proj.weight": 0.4024126479370246,
          "input_layernorm.weight": 0.6964693773965374,
          "post_attention_layernorm.weight": 0.5389671586153284
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5711961200240154,
          "self_attn.o_proj.weight": 0.4091136757892634,
          "mlp.gate_proj.weight": 0.28165197173200957,
          "mlp.up_proj.weight": 0.3558940378823637,
          "mlp.down_proj.weight": 0.3944025471751477,
          "input_layernorm.weight": 0.34554249375842755,
          "post_attention_layernorm.weight": 0.5083749806126059
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.4798904930258487,
          "self_attn.o_proj.weight": 0.3796993330110259,
          "mlp.gate_proj.weight": 0.2834371362960443,
          "mlp.up_proj.weight": 0.34598115404819363,
          "mlp.down_proj.weight": 0.3810985843525749,
          "input_layernorm.weight": 0.40754399908659844,
          "post_attention_layernorm.weight": 0.497314728281144
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5081409141199547,
          "self_attn.o_proj.weight": 0.3629102582375656,
          "mlp.gate_proj.weight": 0.2855739089361027,
          "mlp.up_proj.weight": 0.35880372082662226,
          "mlp.down_proj.weight": 0.38944742416779293,
          "input_layernorm.weight": 0.6608006181001739,
          "post_attention_layernorm.weight": 0.5296959070502865
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.49572269146465,
          "self_attn.o_proj.weight": 0.32994298892262397,
          "mlp.gate_proj.weight": 0.27769206189679957,
          "mlp.up_proj.weight": 0.3455361882658386,
          "mlp.down_proj.weight": 0.3735752249423331,
          "input_layernorm.weight": 0.4170270576790789,
          "post_attention_layernorm.weight": 0.5209363007605031
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.4867919706742694,
          "self_attn.o_proj.weight": 0.3538593097308571,
          "mlp.gate_proj.weight": 0.27117298636599874,
          "mlp.up_proj.weight": 0.33462896686497223,
          "mlp.down_proj.weight": 0.3612770373233193,
          "input_layernorm.weight": 0.6936974577548146,
          "post_attention_layernorm.weight": 0.5001178185439411
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.09349704918366467,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.47712484653665943,
          "self_attn.o_proj.weight": 0.29886608310554436,
          "mlp.gate_proj.weight": 0.2693949444903617,
          "mlp.up_proj.weight": 0.3291873676707558,
          "mlp.down_proj.weight": 0.35004469298030644,
          "input_layernorm.weight": 0.33369843125590304,
          "post_attention_layernorm.weight": 0.5011505559153778
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.40387717352354646,
          "self_attn.k_proj.weight": 0.32231430163408453,
          "self_attn.v_proj.weight": 0.42154930635885873,
          "self_attn.o_proj.weight": 0.2721077912952483,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.40582628271960014,
          "mlp.down_proj.weight": 0.3570913409001458,
          "input_layernorm.weight": 0.5122671690645244,
          "post_attention_layernorm.weight": 0.49147814328550515
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.1520219921182753,
          "self_attn.k_proj.weight": 0.13721084571035752,
          "self_attn.v_proj.weight": 0.40551897093604916,
          "self_attn.o_proj.weight": 0.280649542041515,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.14720318423611445,
          "mlp.down_proj.weight": 0.17429299506277318,
          "input_layernorm.weight": 0.09607062192796464,
          "post_attention_layernorm.weight": 0.44690356138952886
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.007074450011806872,
          "self_attn.k_proj.weight": 0.1615631246455786,
          "self_attn.v_proj.weight": 0.5877874751289588,
          "self_attn.o_proj.weight": 0.3278764246216598,
          "mlp.gate_proj.weight": 0.313893134211228,
          "mlp.up_proj.weight": 0.3528312375210034,
          "mlp.down_proj.weight": 0.35640966437698607,
          "input_layernorm.weight": 0.6657340014175253,
          "post_attention_layernorm.weight": 0.39247642201648
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.9892781643373502,
          "self_attn.k_proj.weight": 0.657748170777267,
          "self_attn.v_proj.weight": 0.4684578019872222,
          "self_attn.o_proj.weight": 0.4193140384970065,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.3405968802707512,
          "mlp.down_proj.weight": 0.3304433709433969,
          "input_layernorm.weight": 0.6733460076353963,
          "post_attention_layernorm.weight": 0.15409983400561383
        },
        "metric": "dwce"
      },
      {
        "excluded": true
      }
    ],
    "supports_kwargs": true,
    "max_batches": 0,
    "norm": "relative",
    "metric": "dwce",
    "cosine_topk": 3,
    "start_index": 0,
    "excluded_pairs": [
      0,
      1,
      25
    ]
  },
  "fisher_num_batches": 64,
  "merge_method": "reparam",
  "merged_params": 3,
  "num_sequences": 128,
  "teacher_source": "previous_cycle_memory",
  "teacher_cycle": 5,
  "eval": {
    "datasets": [
      "wikitext"
    ],
    "configs": [
      "wikitext-2-raw-v1"
    ],
    "split": "test",
    "num_samples": 200,
    "seq_len": 1024,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    }
  },
  "comm": {
    "enabled": true,
    "train_mode": "lora",
    "opt_steps": 6,
    "grad_accum_steps": 1,
    "lr": 1e-05,
    "temp": 2.0,
    "steps_ratio": 0.1,
    "lr_scale": 0.1,
    "interaction_mode": "relative",
    "interaction_eps": 1e-08,
    "mu": 0.5,
    "mu_auto": true,
    "mu_auto_rho": 0.1,
    "mu_auto_eps": 1e-08,
    "sample_eta": 0.5,
    "sample_dwce_scale": 1.0,
    "topk": 1,
    "candidate_pairs": [
      21
    ],
    "trainable_params": 17694720,
    "pair_selection": {
      "num_pairs": 26,
      "excluded_pairs": [
        0,
        1,
        25
      ],
      "candidate_pairs": [
        21
      ],
      "total_samples": 6,
      "unique_pairs": 1,
      "counts": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        6,
        0,
        0,
        0,
        0
      ],
      "freqs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "probs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "top_pairs": [
        {
          "pair": 21,
          "count": 6,
          "freq": 1.0,
          "prob": 1.0
        }
      ]
    },
    "avg_loss": 0.0004060502259892,
    "avg_anchor": 0.0002513490423249702,
    "avg_interaction": 0.09605961292982101,
    "avg_mu": 0.0015758845697112923,
    "teacher_source": "previous_cycle_memory",
    "teacher_cycle": 5,
    "dwce_scores_pre": [
      null,
      null,
      0.10885138210341537,
      0.09962797171315212,
      0.09704690759872063,
      0.10032807292174113,
      0.07477073061165104,
      0.06816870958942334,
      0.08890422275158102,
      0.1163330007130214,
      0.14856900172908535,
      0.15104770110436222,
      0.15976559604817797,
      0.16747390251227062,
      0.17331518344021776,
      0.14107504898742165,
      0.12868654620236872,
      0.13613157978454246,
      0.1218191001657377,
      0.10902217379118222,
      0.08936219242296228,
      0.060528135887943944,
      0.07539605212866492,
      0.09116655794274726,
      0.11185780392379165,
      null
    ],
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 18.54056249689346
    },
    "post_selection_recomputed": false,
    "selected_layer_post": 21
  },
  "distill": {
    "enabled": true,
    "method": "reparam",
    "calib_samples": 256,
    "inst_samples": 0,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 1.0,
    "lr": 0.0001,
    "kl_weight": 0.02,
    "kl_temp": 4.0,
    "hidden_mse_weight": 1.0,
    "attn_mse_weight": 0.25,
    "mlp_mse_weight": 1.0,
    "reparam_eta": 0.0,
    "reparam_gamma": 0.0,
    "reparam_attn_reg_scale": 1.0,
    "reparam_mlp_reg_scale": 1.0,
    "reparam_param_subset": "mlp",
    "reparam_stats": {
      "enabled": true,
      "epochs": 1.0,
      "lr": 0.0001,
      "hidden_mse_weight": 1.0,
      "attn_mse_weight": 0.25,
      "mlp_mse_weight": 1.0,
      "eta": 0.0,
      "gamma": 0.0,
      "attn_reg_scale": 1.0,
      "mlp_reg_scale": 1.0,
      "param_subset": "mlp",
      "num_gates": 3,
      "num_attn_gates": 0,
      "num_mlp_gates": 3,
      "num_other_gates": 0,
      "lambda_init": "fisher_prior"
    },
    "reparam_gate_summary": {
      "num_tensors": 3,
      "num_elements": 176160768,
      "global_mean": 0.48417738505772184,
      "per_tensor_mean": {
        "mlp.gate_proj.weight": 0.4960911273956299,
        "mlp.up_proj.weight": 0.4809410572052002,
        "mlp.down_proj.weight": 0.47550004720687866
      }
    },
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    },
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "instruction_dataset": null,
    "instruction_config": null,
    "instruction_split": "train"
  },
  "lora": {
    "enabled": false,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 0.0,
    "rank": 8,
    "alpha": 16.0,
    "dropout": 0.0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "down_proj",
      "up_proj"
    ],
    "respect_exclude_pairs": false,
    "kl_enabled": false,
    "kl_weight": 0.1,
    "kl_temp": 4.0,
    "post_ppl": null,
    "lr": 0.0001,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "log_steps": 100,
    "eval_every": 2000,
    "eval_max_batches": null
  },
  "norm_policy": "hybrid",
  "full_model_saved": true,
  "full_model": {
    "stage": "cycle_6_full_model",
    "path": "llama3_mlp_skipdwce_nopermute/cycle_6/full_model",
    "weight_bytes": 13460487442,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    },
    "resume_info": "resume_info.json"
  }
}