File size: 19,305 Bytes
ec45673
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
{
  "cycle": 11,
  "layer_merged": 16,
  "num_layers_before": 22,
  "num_layers_after": 21,
  "fused_layer_state": "fused_layer.pt",
  "dwce_score": 0.04662290344732959,
  "dwce_scores": [
    Infinity,
    Infinity,
    0.08755428350373622,
    0.09632033766022144,
    0.12102187652892513,
    0.07687722047289398,
    0.07646200702022135,
    0.10762881995392454,
    0.11769039904744791,
    0.13281737352618223,
    0.13359121989702824,
    0.12213174133312057,
    0.09479378554179792,
    0.0702238861787414,
    0.09607953229105111,
    0.1022260119464429,
    0.04662290344732959,
    0.08633913939177026,
    0.14327522026540693,
    0.14370887781314598,
    Infinity
  ],
  "dwce_meta": {
    "per_pair": [
      {
        "excluded": true
      },
      {
        "excluded": true
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5475512621731512,
          "self_attn.k_proj.weight": 0.391473276857112,
          "self_attn.v_proj.weight": 0.4909271140725403,
          "self_attn.o_proj.weight": 0.5059257993820573,
          "mlp.gate_proj.weight": 0.5142934415231778,
          "mlp.up_proj.weight": 0.564209682386205,
          "mlp.down_proj.weight": 0.5882046140979224,
          "input_layernorm.weight": 0.08591573704035245,
          "post_attention_layernorm.weight": 0.6916345704649807
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.42295415260881286,
          "self_attn.k_proj.weight": 0.6006598545932649,
          "self_attn.v_proj.weight": 0.5839130619828803,
          "self_attn.o_proj.weight": 0.3061736529359485,
          "mlp.gate_proj.weight": 0.20992059002641766,
          "mlp.up_proj.weight": 0.22624838112491824,
          "mlp.down_proj.weight": 0.23975704132014206,
          "input_layernorm.weight": 0.9392704038111166,
          "post_attention_layernorm.weight": 0.534603342461067
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3869070567163679,
          "self_attn.k_proj.weight": 0.41733788169115643,
          "self_attn.v_proj.weight": 0.6819478620941586,
          "self_attn.o_proj.weight": 0.5185018964772606,
          "mlp.gate_proj.weight": 0.3857952544575103,
          "mlp.up_proj.weight": 0.4188764368887535,
          "mlp.down_proj.weight": 0.43345911595128295,
          "input_layernorm.weight": 0.44958074027472317,
          "post_attention_layernorm.weight": 0.47017550272738196
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.41141851886171904,
          "self_attn.k_proj.weight": 0.3273305916332069,
          "self_attn.v_proj.weight": 0.5340342356611809,
          "self_attn.o_proj.weight": 0.479517156619146,
          "mlp.gate_proj.weight": 0.5126061476043234,
          "mlp.up_proj.weight": 0.6602585594995453,
          "mlp.down_proj.weight": 0.6384829779670611,
          "input_layernorm.weight": 0.4515625478959966,
          "post_attention_layernorm.weight": 0.31991898737374685
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.458330232710177,
          "self_attn.k_proj.weight": 0.5289200124472092,
          "self_attn.v_proj.weight": 0.5946075042986118,
          "self_attn.o_proj.weight": 0.4489154493805055,
          "mlp.gate_proj.weight": 0.22742880739738114,
          "mlp.up_proj.weight": 0.26307015782535986,
          "mlp.down_proj.weight": 0.3063196382255173,
          "input_layernorm.weight": 0.9061901333647325,
          "post_attention_layernorm.weight": 0.7022436233626503
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.35959892592956205,
          "self_attn.k_proj.weight": 0.3443886350766098,
          "self_attn.v_proj.weight": 0.5720661391629458,
          "self_attn.o_proj.weight": 0.5818776664595493,
          "mlp.gate_proj.weight": 0.39978735416382805,
          "mlp.up_proj.weight": 0.417749042840734,
          "mlp.down_proj.weight": 0.4387652280731668,
          "input_layernorm.weight": 0.38406718131548756,
          "post_attention_layernorm.weight": 0.6422503049460635
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.636542229290557,
          "self_attn.k_proj.weight": 0.614517102690962,
          "self_attn.v_proj.weight": 0.46813187109223736,
          "self_attn.o_proj.weight": 0.6041227476356936,
          "mlp.gate_proj.weight": 0.40712627903024656,
          "mlp.up_proj.weight": 0.5043431807036443,
          "mlp.down_proj.weight": 0.4505876614150708,
          "input_layernorm.weight": 0.6192326922748669,
          "post_attention_layernorm.weight": 0.6265746279680466
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.34925453885360336,
          "self_attn.k_proj.weight": 0.35278771152971217,
          "self_attn.v_proj.weight": 0.5525661476709113,
          "self_attn.o_proj.weight": 0.5383430454657737,
          "mlp.gate_proj.weight": 0.334643068005848,
          "mlp.up_proj.weight": 0.4327895515323508,
          "mlp.down_proj.weight": 0.4350180162313817,
          "input_layernorm.weight": 0.7771682873929657,
          "post_attention_layernorm.weight": 0.5724920885861154
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5426133527809779,
          "self_attn.k_proj.weight": 0.5917253556706261,
          "self_attn.v_proj.weight": 0.5803218647047271,
          "self_attn.o_proj.weight": 0.573619331665695,
          "mlp.gate_proj.weight": 0.304521619972936,
          "mlp.up_proj.weight": 0.38239116602834844,
          "mlp.down_proj.weight": 0.4294228175148867,
          "input_layernorm.weight": 0.41347205318383795,
          "post_attention_layernorm.weight": 0.4822200725964279
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30216724988405375,
          "self_attn.k_proj.weight": 0.37973968358516746,
          "self_attn.v_proj.weight": 0.5855414716136844,
          "self_attn.o_proj.weight": 0.5522292406340652,
          "mlp.gate_proj.weight": 0.3154624609232963,
          "mlp.up_proj.weight": 0.4254449945154834,
          "mlp.down_proj.weight": 0.4676055055246573,
          "input_layernorm.weight": 0.6319572678591321,
          "post_attention_layernorm.weight": 0.49370291402388483
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2122552427048082,
          "self_attn.k_proj.weight": 0.24749176650690055,
          "self_attn.v_proj.weight": 0.4473200679907213,
          "self_attn.o_proj.weight": 0.43028525139201423,
          "mlp.gate_proj.weight": 0.30637712121093663,
          "mlp.up_proj.weight": 0.3902341181994015,
          "mlp.down_proj.weight": 0.4368393972885733,
          "input_layernorm.weight": 0.2106297820601434,
          "post_attention_layernorm.weight": 0.5594815557912957
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3306107310264498,
          "self_attn.k_proj.weight": 0.3770506265620634,
          "self_attn.v_proj.weight": 0.5762362986478698,
          "self_attn.o_proj.weight": 0.4960079169523212,
          "mlp.gate_proj.weight": 0.2982419943154518,
          "mlp.up_proj.weight": 0.37781123577136094,
          "mlp.down_proj.weight": 0.4191678549276368,
          "input_layernorm.weight": 0.38140716100377725,
          "post_attention_layernorm.weight": 0.49019038978320156
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.09993601246267203,
          "self_attn.k_proj.weight": 0.12521449073239727,
          "self_attn.v_proj.weight": 0.4907779577602679,
          "self_attn.o_proj.weight": 0.42575401537526514,
          "mlp.gate_proj.weight": 0.30881117685577564,
          "mlp.up_proj.weight": 0.36044257082955694,
          "mlp.down_proj.weight": 0.41282750868942547,
          "input_layernorm.weight": 0.5088983795968438,
          "post_attention_layernorm.weight": 0.4496861231516873
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.6602980044411406,
          "self_attn.k_proj.weight": 0.6420444265398998,
          "self_attn.v_proj.weight": 0.5316983977800627,
          "self_attn.o_proj.weight": 0.42227379180088404,
          "mlp.gate_proj.weight": 0.3108678287915571,
          "mlp.up_proj.weight": 0.4041657299501923,
          "mlp.down_proj.weight": 0.43265705494920426,
          "input_layernorm.weight": 0.878605184021867,
          "post_attention_layernorm.weight": 0.5910226949526707
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.2599271947445671,
          "self_attn.v_proj.weight": 0.4975028664708022,
          "self_attn.o_proj.weight": 0.5421607676915916,
          "mlp.gate_proj.weight": 0.3056325201378473,
          "mlp.up_proj.weight": 0.43990546921978413,
          "mlp.down_proj.weight": 0.43404790428518036,
          "input_layernorm.weight": 0.4213949074080623,
          "post_attention_layernorm.weight": 0.09070741198253394
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.05946946364281485,
          "self_attn.k_proj.weight": 0.05838806857596083,
          "self_attn.v_proj.weight": 0.3147092170971509,
          "self_attn.o_proj.weight": 0.17507343342847417,
          "mlp.gate_proj.weight": 0.20693857128512452,
          "mlp.up_proj.weight": 0.17476325022355804,
          "mlp.down_proj.weight": 0.21355502956692002,
          "input_layernorm.weight": 0.049311151072639454,
          "post_attention_layernorm.weight": 0.8488525701289025
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.008338692721690102,
          "self_attn.k_proj.weight": 0.29374913202825975,
          "self_attn.v_proj.weight": 0.6461261275459721,
          "self_attn.o_proj.weight": 0.4819790617006713,
          "mlp.gate_proj.weight": 0.47106954949105545,
          "mlp.up_proj.weight": 0.45120924747821234,
          "mlp.down_proj.weight": 0.49902002059029626,
          "input_layernorm.weight": 0.5308434897842321,
          "post_attention_layernorm.weight": 0.33720525286504943
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.9961265181629947,
          "self_attn.k_proj.weight": 0.6652207081963631,
          "self_attn.v_proj.weight": 0.6393890973156684,
          "self_attn.o_proj.weight": 0.5993613584307386,
          "mlp.gate_proj.weight": 0.3655389632232587,
          "mlp.up_proj.weight": 0.5096844599903007,
          "mlp.down_proj.weight": 0.4675655218682354,
          "input_layernorm.weight": 0.919265488845913,
          "post_attention_layernorm.weight": 0.21712004107502894
        },
        "metric": "dwce"
      },
      {
        "excluded": true
      }
    ],
    "supports_kwargs": true,
    "max_batches": 0,
    "norm": "relative",
    "metric": "dwce",
    "cosine_topk": 3,
    "start_index": 0,
    "excluded_pairs": [
      0,
      1,
      20
    ]
  },
  "fisher_num_batches": 64,
  "merge_method": "reparam",
  "merged_params": 3,
  "num_sequences": 128,
  "teacher_source": "previous_cycle_memory",
  "teacher_cycle": 10,
  "eval": {
    "datasets": [
      "wikitext"
    ],
    "configs": [
      "wikitext-2-raw-v1"
    ],
    "split": "test",
    "num_samples": 200,
    "seq_len": 1024,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    }
  },
  "comm": {
    "enabled": true,
    "train_mode": "lora",
    "opt_steps": 6,
    "grad_accum_steps": 1,
    "lr": 1e-05,
    "temp": 2.0,
    "steps_ratio": 0.1,
    "lr_scale": 0.1,
    "interaction_mode": "relative",
    "interaction_eps": 1e-08,
    "mu": 0.5,
    "mu_auto": true,
    "mu_auto_rho": 0.1,
    "mu_auto_eps": 1e-08,
    "sample_eta": 0.5,
    "sample_dwce_scale": 1.0,
    "topk": 1,
    "candidate_pairs": [
      16
    ],
    "trainable_params": 14417920,
    "pair_selection": {
      "num_pairs": 21,
      "excluded_pairs": [
        0,
        1,
        20
      ],
      "candidate_pairs": [
        16
      ],
      "total_samples": 6,
      "unique_pairs": 1,
      "counts": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        6,
        0,
        0,
        0,
        0
      ],
      "freqs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "probs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "top_pairs": [
        {
          "pair": 16,
          "count": 6,
          "freq": 1.0,
          "prob": 1.0
        }
      ]
    },
    "avg_loss": 0.0004086560936388632,
    "avg_anchor": 0.00023934131847151244,
    "avg_interaction": 0.0413870836297671,
    "avg_mu": 0.0037876796102222893,
    "teacher_source": "previous_cycle_memory",
    "teacher_cycle": 10,
    "dwce_scores_pre": [
      Infinity,
      Infinity,
      0.08755428350373622,
      0.09632033766022144,
      0.12102187652892513,
      0.07687722047289398,
      0.07646200702022135,
      0.10762881995392454,
      0.11769039904744791,
      0.13281737352618223,
      0.13359121989702824,
      0.12213174133312057,
      0.09479378554179792,
      0.0702238861787414,
      0.09607953229105111,
      0.1022260119464429,
      0.04662290344732959,
      0.08633913939177026,
      0.14327522026540693,
      0.14370887781314598,
      Infinity
    ],
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 34.96986894820446
    },
    "post_selection_recomputed": false,
    "selected_layer_post": 16
  },
  "distill": {
    "enabled": true,
    "method": "reparam",
    "calib_samples": 256,
    "inst_samples": 0,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 1.0,
    "lr": 0.0001,
    "kl_weight": 0.02,
    "kl_temp": 4.0,
    "hidden_mse_weight": 1.0,
    "attn_mse_weight": 0.25,
    "mlp_mse_weight": 1.0,
    "reparam_eta": 0.0,
    "reparam_gamma": 0.0,
    "reparam_attn_reg_scale": 1.0,
    "reparam_mlp_reg_scale": 1.0,
    "reparam_param_subset": "mlp",
    "reparam_stats": {
      "enabled": true,
      "epochs": 1.0,
      "lr": 0.0001,
      "hidden_mse_weight": 1.0,
      "attn_mse_weight": 0.25,
      "mlp_mse_weight": 1.0,
      "eta": 0.0,
      "gamma": 0.0,
      "attn_reg_scale": 1.0,
      "mlp_reg_scale": 1.0,
      "param_subset": "mlp",
      "num_gates": 3,
      "num_attn_gates": 0,
      "num_mlp_gates": 3,
      "num_other_gates": 0,
      "lambda_init": "fisher_prior"
    },
    "reparam_gate_summary": {
      "num_tensors": 3,
      "num_elements": 176160768,
      "global_mean": 0.5134112267267137,
      "per_tensor_mean": {
        "mlp.gate_proj.weight": 0.5045510530471802,
        "mlp.up_proj.weight": 0.5129446983337402,
        "mlp.down_proj.weight": 0.522737979888916
      }
    },
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "instruction_dataset": null,
    "instruction_config": null,
    "instruction_split": "train"
  },
  "lora": {
    "enabled": false,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 0.0,
    "rank": 8,
    "alpha": 16.0,
    "dropout": 0.0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "down_proj",
      "up_proj"
    ],
    "respect_exclude_pairs": false,
    "kl_enabled": false,
    "kl_weight": 0.1,
    "kl_temp": 4.0,
    "post_ppl": null,
    "lr": 0.0001,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "log_steps": 100,
    "eval_every": 2000,
    "eval_max_batches": null
  },
  "norm_policy": "hybrid",
  "full_model_saved": true,
  "full_model": {
    "stage": "cycle_11_full_model",
    "path": "llama3_mlp_skipdwce_nopermute/cycle_11/full_model",
    "weight_bytes": 11279358471,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "resume_info": "resume_info.json"
  }
}