{ "cycle": 6, "layer_merged": 21, "num_layers_before": 27, "num_layers_after": 26, "fused_layer_state": "fused_layer.pt", "dwce_score": 0.060528135887943944, "dwce_scores": [ null, null, 0.10885138210341537, 0.09962797171315212, 0.09704690759872063, 0.10032807292174113, 0.07477073061165104, 0.06816870958942334, 0.08890422275158102, 0.1163330007130214, 0.14856900172908535, 0.15104770110436222, 0.15976559604817797, 0.16747390251227062, 0.17331518344021776, 0.14107504898742165, 0.12868654620236872, 0.13613157978454246, 0.1218191001657377, 0.10902217379118222, 0.08936219242296228, 0.060528135887943944, 0.07539605212866492, 0.09116655794274726, 0.11185780392379165, null ], "dwce_meta": { "per_pair": [ { "excluded": true }, { "excluded": true }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.45602849053786343, "self_attn.k_proj.weight": 0.393914402513139, "self_attn.v_proj.weight": 0.540396339894555, "self_attn.o_proj.weight": 0.4469669933138352, "mlp.gate_proj.weight": 0.3287149601078028, "mlp.up_proj.weight": 0.35017769317939573, "mlp.down_proj.weight": 0.396783937778306, "input_layernorm.weight": 0.17714660120924552, "post_attention_layernorm.weight": 0.5706843420160465 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.2956975268685422, "self_attn.k_proj.weight": 0.3772577526025228, "self_attn.v_proj.weight": 0.5824193735130013, "self_attn.o_proj.weight": 0.44336875810997856, "mlp.gate_proj.weight": 0.30769135298804196, "mlp.up_proj.weight": 0.35382330315519916, "mlp.down_proj.weight": 0.37185043779817395, "input_layernorm.weight": 0.7024250477029108, "post_attention_layernorm.weight": 0.5121384967983699 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 
0.30319082595336366, "self_attn.k_proj.weight": 0.3492474196377044, "self_attn.v_proj.weight": 0.569890311972998, "self_attn.o_proj.weight": 0.40464328058154025, "mlp.gate_proj.weight": 0.28532597693591105, "mlp.up_proj.weight": 0.37282024323447943, "mlp.down_proj.weight": 0.3841916217498055, "input_layernorm.weight": 0.4863317106902496, "post_attention_layernorm.weight": 0.57501372396618 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.30239571422605777, "self_attn.k_proj.weight": 0.28777982633775234, "self_attn.v_proj.weight": 0.5214361921442343, "self_attn.o_proj.weight": 0.43105923012955555, "mlp.gate_proj.weight": 0.2730870548532783, "mlp.up_proj.weight": 0.3623753277725939, "mlp.down_proj.weight": 0.38708954857683764, "input_layernorm.weight": 0.4643158587476409, "post_attention_layernorm.weight": 0.4712457550840951 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.34829974358469984, "self_attn.k_proj.weight": 0.3124058890492968, "self_attn.v_proj.weight": 0.44195795428146895, "self_attn.o_proj.weight": 0.4143407830271052, "mlp.gate_proj.weight": 0.5, "mlp.up_proj.weight": 0.5509473359135564, "mlp.down_proj.weight": 0.5172651880216964, "input_layernorm.weight": 0.6178273084347614, "post_attention_layernorm.weight": 0.34427816773879094 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.2730859710153346, "self_attn.k_proj.weight": 0.3061184297688129, "self_attn.v_proj.weight": 0.533020418936679, "self_attn.o_proj.weight": 0.4119345004856529, "mlp.gate_proj.weight": 0.5, "mlp.up_proj.weight": 0.13066309879562818, "mlp.down_proj.weight": 0.2330601154961996, "input_layernorm.weight": 0.8059542094784746, "post_attention_layernorm.weight": 
0.6625755866545766 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.48768939039675996, "self_attn.k_proj.weight": 0.4777739962444542, "self_attn.v_proj.weight": 0.5103851366152496, "self_attn.o_proj.weight": 0.43250945492739656, "mlp.gate_proj.weight": 0.3257707165607638, "mlp.up_proj.weight": 0.4096843334038478, "mlp.down_proj.weight": 0.34885861400447404, "input_layernorm.weight": 0.4200429536840167, "post_attention_layernorm.weight": 0.5367871634336786 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.29755787633442426, "self_attn.k_proj.weight": 0.37623755403743336, "self_attn.v_proj.weight": 0.6136683116996197, "self_attn.o_proj.weight": 0.5741847831544276, "mlp.gate_proj.weight": 0.27139084405979624, "mlp.up_proj.weight": 0.352502726534821, "mlp.down_proj.weight": 0.3594937163598074, "input_layernorm.weight": 0.5244406580066927, "post_attention_layernorm.weight": 0.5383591174280284 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.48784027196921265, "self_attn.k_proj.weight": 0.39846873275053135, "self_attn.v_proj.weight": 0.4618083281504365, "self_attn.o_proj.weight": 0.5396497221340433, "mlp.gate_proj.weight": 0.28731097517799203, "mlp.up_proj.weight": 0.37777335079811836, "mlp.down_proj.weight": 0.38422114872976587, "input_layernorm.weight": 0.5041621307381597, "post_attention_layernorm.weight": 0.5670639954113798 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.23827875940660836, "self_attn.k_proj.weight": 0.3238894105610408, "self_attn.v_proj.weight": 0.5690090563700821, "self_attn.o_proj.weight": 0.5450013202392769, 
"mlp.gate_proj.weight": 0.27271401584431, "mlp.up_proj.weight": 0.3708806850281214, "mlp.down_proj.weight": 0.39500743275208433, "input_layernorm.weight": 0.6987078362557021, "post_attention_layernorm.weight": 0.5374434799165528 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.3946032688367209, "self_attn.k_proj.weight": 0.40690493743203565, "self_attn.v_proj.weight": 0.5164469910119862, "self_attn.o_proj.weight": 0.4597598293804492, "mlp.gate_proj.weight": 0.269573396995867, "mlp.up_proj.weight": 0.35366279000533957, "mlp.down_proj.weight": 0.3877383926763193, "input_layernorm.weight": 0.5460628958641058, "post_attention_layernorm.weight": 0.5071179303686918 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.3008175631152415, "self_attn.v_proj.weight": 0.5493743908332103, "self_attn.o_proj.weight": 0.4661606538187534, "mlp.gate_proj.weight": 0.2783245620057637, "mlp.up_proj.weight": 0.36798475292532096, "mlp.down_proj.weight": 0.41990780468185046, "input_layernorm.weight": 0.3526699737302493, "post_attention_layernorm.weight": 0.5026090703676459 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.5046790280321507, "self_attn.o_proj.weight": 0.47862372917098306, "mlp.gate_proj.weight": 0.28429170701639556, "mlp.up_proj.weight": 0.3568216404807901, "mlp.down_proj.weight": 0.4024126479370246, "input_layernorm.weight": 0.6964693773965374, "post_attention_layernorm.weight": 0.5389671586153284 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, 
"self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.5711961200240154, "self_attn.o_proj.weight": 0.4091136757892634, "mlp.gate_proj.weight": 0.28165197173200957, "mlp.up_proj.weight": 0.3558940378823637, "mlp.down_proj.weight": 0.3944025471751477, "input_layernorm.weight": 0.34554249375842755, "post_attention_layernorm.weight": 0.5083749806126059 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.4798904930258487, "self_attn.o_proj.weight": 0.3796993330110259, "mlp.gate_proj.weight": 0.2834371362960443, "mlp.up_proj.weight": 0.34598115404819363, "mlp.down_proj.weight": 0.3810985843525749, "input_layernorm.weight": 0.40754399908659844, "post_attention_layernorm.weight": 0.497314728281144 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.5081409141199547, "self_attn.o_proj.weight": 0.3629102582375656, "mlp.gate_proj.weight": 0.2855739089361027, "mlp.up_proj.weight": 0.35880372082662226, "mlp.down_proj.weight": 0.38944742416779293, "input_layernorm.weight": 0.6608006181001739, "post_attention_layernorm.weight": 0.5296959070502865 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.49572269146465, "self_attn.o_proj.weight": 0.32994298892262397, "mlp.gate_proj.weight": 0.27769206189679957, "mlp.up_proj.weight": 0.3455361882658386, "mlp.down_proj.weight": 0.3735752249423331, "input_layernorm.weight": 0.4170270576790789, "post_attention_layernorm.weight": 0.5209363007605031 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": 
"relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.4867919706742694, "self_attn.o_proj.weight": 0.3538593097308571, "mlp.gate_proj.weight": 0.27117298636599874, "mlp.up_proj.weight": 0.33462896686497223, "mlp.down_proj.weight": 0.3612770373233193, "input_layernorm.weight": 0.6936974577548146, "post_attention_layernorm.weight": 0.5001178185439411 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.09349704918366467, "self_attn.k_proj.weight": 0.5, "self_attn.v_proj.weight": 0.47712484653665943, "self_attn.o_proj.weight": 0.29886608310554436, "mlp.gate_proj.weight": 0.2693949444903617, "mlp.up_proj.weight": 0.3291873676707558, "mlp.down_proj.weight": 0.35004469298030644, "input_layernorm.weight": 0.33369843125590304, "post_attention_layernorm.weight": 0.5011505559153778 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.40387717352354646, "self_attn.k_proj.weight": 0.32231430163408453, "self_attn.v_proj.weight": 0.42154930635885873, "self_attn.o_proj.weight": 0.2721077912952483, "mlp.gate_proj.weight": 0.5, "mlp.up_proj.weight": 0.40582628271960014, "mlp.down_proj.weight": 0.3570913409001458, "input_layernorm.weight": 0.5122671690645244, "post_attention_layernorm.weight": 0.49147814328550515 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.1520219921182753, "self_attn.k_proj.weight": 0.13721084571035752, "self_attn.v_proj.weight": 0.40551897093604916, "self_attn.o_proj.weight": 0.280649542041515, "mlp.gate_proj.weight": 0.5, "mlp.up_proj.weight": 0.14720318423611445, "mlp.down_proj.weight": 0.17429299506277318, "input_layernorm.weight": 
0.09607062192796464, "post_attention_layernorm.weight": 0.44690356138952886 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.007074450011806872, "self_attn.k_proj.weight": 0.1615631246455786, "self_attn.v_proj.weight": 0.5877874751289588, "self_attn.o_proj.weight": 0.3278764246216598, "mlp.gate_proj.weight": 0.313893134211228, "mlp.up_proj.weight": 0.3528312375210034, "mlp.down_proj.weight": 0.35640966437698607, "input_layernorm.weight": 0.6657340014175253, "post_attention_layernorm.weight": 0.39247642201648 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.9892781643373502, "self_attn.k_proj.weight": 0.657748170777267, "self_attn.v_proj.weight": 0.4684578019872222, "self_attn.o_proj.weight": 0.4193140384970065, "mlp.gate_proj.weight": 0.5, "mlp.up_proj.weight": 0.3405968802707512, "mlp.down_proj.weight": 0.3304433709433969, "input_layernorm.weight": 0.6733460076353963, "post_attention_layernorm.weight": 0.15409983400561383 }, "metric": "dwce" }, { "excluded": true } ], "supports_kwargs": true, "max_batches": 0, "norm": "relative", "metric": "dwce", "cosine_topk": 3, "start_index": 0, "excluded_pairs": [ 0, 1, 25 ] }, "fisher_num_batches": 64, "merge_method": "reparam", "merged_params": 3, "num_sequences": 128, "teacher_source": "previous_cycle_memory", "teacher_cycle": 5, "eval": { "datasets": [ "wikitext" ], "configs": [ "wikitext-2-raw-v1" ], "split": "test", "num_samples": 200, "seq_len": 1024, "post_ppl": { "wikitext:wikitext-2-raw-v1": 19.929857094583117 } }, "comm": { "enabled": true, "train_mode": "lora", "opt_steps": 6, "grad_accum_steps": 1, "lr": 1e-05, "temp": 2.0, "steps_ratio": 0.1, "lr_scale": 0.1, "interaction_mode": "relative", "interaction_eps": 1e-08, "mu": 0.5, "mu_auto": true, "mu_auto_rho": 0.1, "mu_auto_eps": 1e-08, 
"sample_eta": 0.5, "sample_dwce_scale": 1.0, "topk": 1, "candidate_pairs": [ 21 ], "trainable_params": 17694720, "pair_selection": { "num_pairs": 26, "excluded_pairs": [ 0, 1, 25 ], "candidate_pairs": [ 21 ], "total_samples": 6, "unique_pairs": 1, "counts": [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0 ], "freqs": [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ], "probs": [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ], "top_pairs": [ { "pair": 21, "count": 6, "freq": 1.0, "prob": 1.0 } ] }, "avg_loss": 0.0004060502259892, "avg_anchor": 0.0002513490423249702, "avg_interaction": 0.09605961292982101, "avg_mu": 0.0015758845697112923, "teacher_source": "previous_cycle_memory", "teacher_cycle": 5, "dwce_scores_pre": [ null, null, 0.10885138210341537, 0.09962797171315212, 0.09704690759872063, 0.10032807292174113, 0.07477073061165104, 0.06816870958942334, 0.08890422275158102, 0.1163330007130214, 0.14856900172908535, 0.15104770110436222, 0.15976559604817797, 0.16747390251227062, 0.17331518344021776, 0.14107504898742165, 0.12868654620236872, 0.13613157978454246, 0.1218191001657377, 0.10902217379118222, 0.08936219242296228, 0.060528135887943944, 0.07539605212866492, 0.09116655794274726, 0.11185780392379165, null ], "post_ppl": { "wikitext:wikitext-2-raw-v1": 18.54056249689346 }, "post_selection_recomputed": false, "selected_layer_post": 21 }, "distill": { "enabled": true, "method": "reparam", "calib_samples": 256, "inst_samples": 0, "seq_len": 1024, "batch_size": 1, "epochs": 1.0, "lr": 0.0001, "kl_weight": 0.02, "kl_temp": 4.0, "hidden_mse_weight": 1.0, "attn_mse_weight": 0.25, "mlp_mse_weight": 1.0, "reparam_eta": 0.0, "reparam_gamma": 0.0, "reparam_attn_reg_scale": 1.0, "reparam_mlp_reg_scale": 1.0, "reparam_param_subset": "mlp", "reparam_stats": { "enabled": 
true, "epochs": 1.0, "lr": 0.0001, "hidden_mse_weight": 1.0, "attn_mse_weight": 0.25, "mlp_mse_weight": 1.0, "eta": 0.0, "gamma": 0.0, "attn_reg_scale": 1.0, "mlp_reg_scale": 1.0, "param_subset": "mlp", "num_gates": 3, "num_attn_gates": 0, "num_mlp_gates": 3, "num_other_gates": 0, "lambda_init": "fisher_prior" }, "reparam_gate_summary": { "num_tensors": 3, "num_elements": 176160768, "global_mean": 0.48417738505772184, "per_tensor_mean": { "mlp.gate_proj.weight": 0.4960911273956299, "mlp.up_proj.weight": 0.4809410572052002, "mlp.down_proj.weight": 0.47550004720687866 } }, "post_ppl": { "wikitext:wikitext-2-raw-v1": 19.929857094583117 }, "weight_decay": 0.0, "max_grad_norm": 1.0, "grad_accum_steps": 1, "instruction_dataset": null, "instruction_config": null, "instruction_split": "train" }, "lora": { "enabled": false, "seq_len": 1024, "batch_size": 1, "epochs": 0.0, "rank": 8, "alpha": 16.0, "dropout": 0.0, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "respect_exclude_pairs": false, "kl_enabled": false, "kl_weight": 0.1, "kl_temp": 4.0, "post_ppl": null, "lr": 0.0001, "weight_decay": 0.0, "max_grad_norm": 1.0, "grad_accum_steps": 1, "log_steps": 100, "eval_every": 2000, "eval_max_batches": null }, "norm_policy": "hybrid", "full_model_saved": true, "full_model": { "stage": "cycle_6_full_model", "path": "llama3_mlp_skipdwce_nopermute/cycle_6/full_model", "weight_bytes": 13460487442, "post_ppl": { "wikitext:wikitext-2-raw-v1": 19.929857094583117 }, "resume_info": "resume_info.json" } }