{
  "cycle": 6,
  "layer_merged": 21,
  "num_layers_before": 27,
  "num_layers_after": 26,
  "fused_layer_state": "fused_layer.pt",
  "dwce_score": 0.060528135887943944,
  "dwce_scores": [
    null,
    null,
    0.10885138210341537,
    0.09962797171315212,
    0.09704690759872063,
    0.10032807292174113,
    0.07477073061165104,
    0.06816870958942334,
    0.08890422275158102,
    0.1163330007130214,
    0.14856900172908535,
    0.15104770110436222,
    0.15976559604817797,
    0.16747390251227062,
    0.17331518344021776,
    0.14107504898742165,
    0.12868654620236872,
    0.13613157978454246,
    0.1218191001657377,
    0.10902217379118222,
    0.08936219242296228,
    0.060528135887943944,
    0.07539605212866492,
    0.09116655794274726,
    0.11185780392379165,
    null
  ],
  "dwce_meta": {
    "per_pair": [
      {
        "excluded": true
      },
      {
        "excluded": true
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.45602849053786343,
          "self_attn.k_proj.weight": 0.393914402513139,
          "self_attn.v_proj.weight": 0.540396339894555,
          "self_attn.o_proj.weight": 0.4469669933138352,
          "mlp.gate_proj.weight": 0.3287149601078028,
          "mlp.up_proj.weight": 0.35017769317939573,
          "mlp.down_proj.weight": 0.396783937778306,
          "input_layernorm.weight": 0.17714660120924552,
          "post_attention_layernorm.weight": 0.5706843420160465
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2956975268685422,
          "self_attn.k_proj.weight": 0.3772577526025228,
          "self_attn.v_proj.weight": 0.5824193735130013,
          "self_attn.o_proj.weight": 0.44336875810997856,
          "mlp.gate_proj.weight": 0.30769135298804196,
          "mlp.up_proj.weight": 0.35382330315519916,
          "mlp.down_proj.weight": 0.37185043779817395,
          "input_layernorm.weight": 0.7024250477029108,
          "post_attention_layernorm.weight": 0.5121384967983699
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30319082595336366,
          "self_attn.k_proj.weight": 0.3492474196377044,
          "self_attn.v_proj.weight": 0.569890311972998,
          "self_attn.o_proj.weight": 0.40464328058154025,
          "mlp.gate_proj.weight": 0.28532597693591105,
          "mlp.up_proj.weight": 0.37282024323447943,
          "mlp.down_proj.weight": 0.3841916217498055,
          "input_layernorm.weight": 0.4863317106902496,
          "post_attention_layernorm.weight": 0.57501372396618
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30239571422605777,
          "self_attn.k_proj.weight": 0.28777982633775234,
          "self_attn.v_proj.weight": 0.5214361921442343,
          "self_attn.o_proj.weight": 0.43105923012955555,
          "mlp.gate_proj.weight": 0.2730870548532783,
          "mlp.up_proj.weight": 0.3623753277725939,
          "mlp.down_proj.weight": 0.38708954857683764,
          "input_layernorm.weight": 0.4643158587476409,
          "post_attention_layernorm.weight": 0.4712457550840951
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.34829974358469984,
          "self_attn.k_proj.weight": 0.3124058890492968,
          "self_attn.v_proj.weight": 0.44195795428146895,
          "self_attn.o_proj.weight": 0.4143407830271052,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.5509473359135564,
          "mlp.down_proj.weight": 0.5172651880216964,
          "input_layernorm.weight": 0.6178273084347614,
          "post_attention_layernorm.weight": 0.34427816773879094
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2730859710153346,
          "self_attn.k_proj.weight": 0.3061184297688129,
          "self_attn.v_proj.weight": 0.533020418936679,
          "self_attn.o_proj.weight": 0.4119345004856529,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.13066309879562818,
          "mlp.down_proj.weight": 0.2330601154961996,
          "input_layernorm.weight": 0.8059542094784746,
          "post_attention_layernorm.weight": 0.6625755866545766
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.48768939039675996,
          "self_attn.k_proj.weight": 0.4777739962444542,
          "self_attn.v_proj.weight": 0.5103851366152496,
          "self_attn.o_proj.weight": 0.43250945492739656,
          "mlp.gate_proj.weight": 0.3257707165607638,
          "mlp.up_proj.weight": 0.4096843334038478,
          "mlp.down_proj.weight": 0.34885861400447404,
          "input_layernorm.weight": 0.4200429536840167,
          "post_attention_layernorm.weight": 0.5367871634336786
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.29755787633442426,
          "self_attn.k_proj.weight": 0.37623755403743336,
          "self_attn.v_proj.weight": 0.6136683116996197,
          "self_attn.o_proj.weight": 0.5741847831544276,
          "mlp.gate_proj.weight": 0.27139084405979624,
          "mlp.up_proj.weight": 0.352502726534821,
          "mlp.down_proj.weight": 0.3594937163598074,
          "input_layernorm.weight": 0.5244406580066927,
          "post_attention_layernorm.weight": 0.5383591174280284
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.48784027196921265,
          "self_attn.k_proj.weight": 0.39846873275053135,
          "self_attn.v_proj.weight": 0.4618083281504365,
          "self_attn.o_proj.weight": 0.5396497221340433,
          "mlp.gate_proj.weight": 0.28731097517799203,
          "mlp.up_proj.weight": 0.37777335079811836,
          "mlp.down_proj.weight": 0.38422114872976587,
          "input_layernorm.weight": 0.5041621307381597,
          "post_attention_layernorm.weight": 0.5670639954113798
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.23827875940660836,
          "self_attn.k_proj.weight": 0.3238894105610408,
          "self_attn.v_proj.weight": 0.5690090563700821,
          "self_attn.o_proj.weight": 0.5450013202392769,
          "mlp.gate_proj.weight": 0.27271401584431,
          "mlp.up_proj.weight": 0.3708806850281214,
          "mlp.down_proj.weight": 0.39500743275208433,
          "input_layernorm.weight": 0.6987078362557021,
          "post_attention_layernorm.weight": 0.5374434799165528
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3946032688367209,
          "self_attn.k_proj.weight": 0.40690493743203565,
          "self_attn.v_proj.weight": 0.5164469910119862,
          "self_attn.o_proj.weight": 0.4597598293804492,
          "mlp.gate_proj.weight": 0.269573396995867,
          "mlp.up_proj.weight": 0.35366279000533957,
          "mlp.down_proj.weight": 0.3877383926763193,
          "input_layernorm.weight": 0.5460628958641058,
          "post_attention_layernorm.weight": 0.5071179303686918
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.3008175631152415,
          "self_attn.v_proj.weight": 0.5493743908332103,
          "self_attn.o_proj.weight": 0.4661606538187534,
          "mlp.gate_proj.weight": 0.2783245620057637,
          "mlp.up_proj.weight": 0.36798475292532096,
          "mlp.down_proj.weight": 0.41990780468185046,
          "input_layernorm.weight": 0.3526699737302493,
          "post_attention_layernorm.weight": 0.5026090703676459
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5046790280321507,
          "self_attn.o_proj.weight": 0.47862372917098306,
          "mlp.gate_proj.weight": 0.28429170701639556,
          "mlp.up_proj.weight": 0.3568216404807901,
          "mlp.down_proj.weight": 0.4024126479370246,
          "input_layernorm.weight": 0.6964693773965374,
          "post_attention_layernorm.weight": 0.5389671586153284
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5711961200240154,
          "self_attn.o_proj.weight": 0.4091136757892634,
          "mlp.gate_proj.weight": 0.28165197173200957,
          "mlp.up_proj.weight": 0.3558940378823637,
          "mlp.down_proj.weight": 0.3944025471751477,
          "input_layernorm.weight": 0.34554249375842755,
          "post_attention_layernorm.weight": 0.5083749806126059
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.4798904930258487,
          "self_attn.o_proj.weight": 0.3796993330110259,
          "mlp.gate_proj.weight": 0.2834371362960443,
          "mlp.up_proj.weight": 0.34598115404819363,
          "mlp.down_proj.weight": 0.3810985843525749,
          "input_layernorm.weight": 0.40754399908659844,
          "post_attention_layernorm.weight": 0.497314728281144
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.5081409141199547,
          "self_attn.o_proj.weight": 0.3629102582375656,
          "mlp.gate_proj.weight": 0.2855739089361027,
          "mlp.up_proj.weight": 0.35880372082662226,
          "mlp.down_proj.weight": 0.38944742416779293,
          "input_layernorm.weight": 0.6608006181001739,
          "post_attention_layernorm.weight": 0.5296959070502865
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.49572269146465,
          "self_attn.o_proj.weight": 0.32994298892262397,
          "mlp.gate_proj.weight": 0.27769206189679957,
          "mlp.up_proj.weight": 0.3455361882658386,
          "mlp.down_proj.weight": 0.3735752249423331,
          "input_layernorm.weight": 0.4170270576790789,
          "post_attention_layernorm.weight": 0.5209363007605031
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.4867919706742694,
          "self_attn.o_proj.weight": 0.3538593097308571,
          "mlp.gate_proj.weight": 0.27117298636599874,
          "mlp.up_proj.weight": 0.33462896686497223,
          "mlp.down_proj.weight": 0.3612770373233193,
          "input_layernorm.weight": 0.6936974577548146,
          "post_attention_layernorm.weight": 0.5001178185439411
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.09349704918366467,
          "self_attn.k_proj.weight": 0.5,
          "self_attn.v_proj.weight": 0.47712484653665943,
          "self_attn.o_proj.weight": 0.29886608310554436,
          "mlp.gate_proj.weight": 0.2693949444903617,
          "mlp.up_proj.weight": 0.3291873676707558,
          "mlp.down_proj.weight": 0.35004469298030644,
          "input_layernorm.weight": 0.33369843125590304,
          "post_attention_layernorm.weight": 0.5011505559153778
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.40387717352354646,
          "self_attn.k_proj.weight": 0.32231430163408453,
          "self_attn.v_proj.weight": 0.42154930635885873,
          "self_attn.o_proj.weight": 0.2721077912952483,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.40582628271960014,
          "mlp.down_proj.weight": 0.3570913409001458,
          "input_layernorm.weight": 0.5122671690645244,
          "post_attention_layernorm.weight": 0.49147814328550515
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.1520219921182753,
          "self_attn.k_proj.weight": 0.13721084571035752,
          "self_attn.v_proj.weight": 0.40551897093604916,
          "self_attn.o_proj.weight": 0.280649542041515,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.14720318423611445,
          "mlp.down_proj.weight": 0.17429299506277318,
          "input_layernorm.weight": 0.09607062192796464,
          "post_attention_layernorm.weight": 0.44690356138952886
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.007074450011806872,
          "self_attn.k_proj.weight": 0.1615631246455786,
          "self_attn.v_proj.weight": 0.5877874751289588,
          "self_attn.o_proj.weight": 0.3278764246216598,
          "mlp.gate_proj.weight": 0.313893134211228,
          "mlp.up_proj.weight": 0.3528312375210034,
          "mlp.down_proj.weight": 0.35640966437698607,
          "input_layernorm.weight": 0.6657340014175253,
          "post_attention_layernorm.weight": 0.39247642201648
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.9892781643373502,
          "self_attn.k_proj.weight": 0.657748170777267,
          "self_attn.v_proj.weight": 0.4684578019872222,
          "self_attn.o_proj.weight": 0.4193140384970065,
          "mlp.gate_proj.weight": 0.5,
          "mlp.up_proj.weight": 0.3405968802707512,
          "mlp.down_proj.weight": 0.3304433709433969,
          "input_layernorm.weight": 0.6733460076353963,
          "post_attention_layernorm.weight": 0.15409983400561383
        },
        "metric": "dwce"
      },
      {
        "excluded": true
      }
    ],
    "supports_kwargs": true,
    "max_batches": 0,
    "norm": "relative",
    "metric": "dwce",
    "cosine_topk": 3,
    "start_index": 0,
    "excluded_pairs": [
      0,
      1,
      25
    ]
  },
  "fisher_num_batches": 64,
  "merge_method": "reparam",
  "merged_params": 3,
  "num_sequences": 128,
  "teacher_source": "previous_cycle_memory",
  "teacher_cycle": 5,
  "eval": {
    "datasets": [
      "wikitext"
    ],
    "configs": [
      "wikitext-2-raw-v1"
    ],
    "split": "test",
    "num_samples": 200,
    "seq_len": 1024,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    }
  },
  "comm": {
    "enabled": true,
    "train_mode": "lora",
    "opt_steps": 6,
    "grad_accum_steps": 1,
    "lr": 1e-05,
    "temp": 2.0,
    "steps_ratio": 0.1,
    "lr_scale": 0.1,
    "interaction_mode": "relative",
    "interaction_eps": 1e-08,
    "mu": 0.5,
    "mu_auto": true,
    "mu_auto_rho": 0.1,
    "mu_auto_eps": 1e-08,
    "sample_eta": 0.5,
    "sample_dwce_scale": 1.0,
    "topk": 1,
    "candidate_pairs": [
      21
    ],
    "trainable_params": 17694720,
    "pair_selection": {
      "num_pairs": 26,
      "excluded_pairs": [
        0,
        1,
        25
      ],
      "candidate_pairs": [
        21
      ],
      "total_samples": 6,
      "unique_pairs": 1,
      "counts": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        6,
        0,
        0,
        0,
        0
      ],
      "freqs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "probs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "top_pairs": [
        {
          "pair": 21,
          "count": 6,
          "freq": 1.0,
          "prob": 1.0
        }
      ]
    },
    "avg_loss": 0.0004060502259892,
    "avg_anchor": 0.0002513490423249702,
    "avg_interaction": 0.09605961292982101,
    "avg_mu": 0.0015758845697112923,
    "teacher_source": "previous_cycle_memory",
    "teacher_cycle": 5,
    "dwce_scores_pre": [
      null,
      null,
      0.10885138210341537,
      0.09962797171315212,
      0.09704690759872063,
      0.10032807292174113,
      0.07477073061165104,
      0.06816870958942334,
      0.08890422275158102,
      0.1163330007130214,
      0.14856900172908535,
      0.15104770110436222,
      0.15976559604817797,
      0.16747390251227062,
      0.17331518344021776,
      0.14107504898742165,
      0.12868654620236872,
      0.13613157978454246,
      0.1218191001657377,
      0.10902217379118222,
      0.08936219242296228,
      0.060528135887943944,
      0.07539605212866492,
      0.09116655794274726,
      0.11185780392379165,
      null
    ],
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 18.54056249689346
    },
    "post_selection_recomputed": false,
    "selected_layer_post": 21
  },
  "distill": {
    "enabled": true,
    "method": "reparam",
    "calib_samples": 256,
    "inst_samples": 0,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 1.0,
    "lr": 0.0001,
    "kl_weight": 0.02,
    "kl_temp": 4.0,
    "hidden_mse_weight": 1.0,
    "attn_mse_weight": 0.25,
    "mlp_mse_weight": 1.0,
    "reparam_eta": 0.0,
    "reparam_gamma": 0.0,
    "reparam_attn_reg_scale": 1.0,
    "reparam_mlp_reg_scale": 1.0,
    "reparam_param_subset": "mlp",
    "reparam_stats": {
      "enabled": true,
      "epochs": 1.0,
      "lr": 0.0001,
      "hidden_mse_weight": 1.0,
      "attn_mse_weight": 0.25,
      "mlp_mse_weight": 1.0,
      "eta": 0.0,
      "gamma": 0.0,
      "attn_reg_scale": 1.0,
      "mlp_reg_scale": 1.0,
      "param_subset": "mlp",
      "num_gates": 3,
      "num_attn_gates": 0,
      "num_mlp_gates": 3,
      "num_other_gates": 0,
      "lambda_init": "fisher_prior"
    },
    "reparam_gate_summary": {
      "num_tensors": 3,
      "num_elements": 176160768,
      "global_mean": 0.48417738505772184,
      "per_tensor_mean": {
        "mlp.gate_proj.weight": 0.4960911273956299,
        "mlp.up_proj.weight": 0.4809410572052002,
        "mlp.down_proj.weight": 0.47550004720687866
      }
    },
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    },
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "instruction_dataset": null,
    "instruction_config": null,
    "instruction_split": "train"
  },
  "lora": {
    "enabled": false,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 0.0,
    "rank": 8,
    "alpha": 16.0,
    "dropout": 0.0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "down_proj",
      "up_proj"
    ],
    "respect_exclude_pairs": false,
    "kl_enabled": false,
    "kl_weight": 0.1,
    "kl_temp": 4.0,
    "post_ppl": null,
    "lr": 0.0001,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "log_steps": 100,
    "eval_every": 2000,
    "eval_max_batches": null
  },
  "norm_policy": "hybrid",
  "full_model_saved": true,
  "full_model": {
    "stage": "cycle_6_full_model",
    "path": "llama3_mlp_skipdwce_nopermute/cycle_6/full_model",
    "weight_bytes": 13460487442,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 19.929857094583117
    },
    "resume_info": "resume_info.json"
  }
}