{ "cycle": 11, "layer_merged": 16, "num_layers_before": 22, "num_layers_after": 21, "fused_layer_state": "fused_layer.pt", "dwce_score": 0.04662290344732959, "dwce_scores": [ Infinity, Infinity, 0.08755428350373622, 0.09632033766022144, 0.12102187652892513, 0.07687722047289398, 0.07646200702022135, 0.10762881995392454, 0.11769039904744791, 0.13281737352618223, 0.13359121989702824, 0.12213174133312057, 0.09479378554179792, 0.0702238861787414, 0.09607953229105111, 0.1022260119464429, 0.04662290344732959, 0.08633913939177026, 0.14327522026540693, 0.14370887781314598, Infinity ], "dwce_meta": { "per_pair": [ { "excluded": true }, { "excluded": true }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5475512621731512, "self_attn.k_proj.weight": 0.391473276857112, "self_attn.v_proj.weight": 0.4909271140725403, "self_attn.o_proj.weight": 0.5059257993820573, "mlp.gate_proj.weight": 0.5142934415231778, "mlp.up_proj.weight": 0.564209682386205, "mlp.down_proj.weight": 0.5882046140979224, "input_layernorm.weight": 0.08591573704035245, "post_attention_layernorm.weight": 0.6916345704649807 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.42295415260881286, "self_attn.k_proj.weight": 0.6006598545932649, "self_attn.v_proj.weight": 0.5839130619828803, "self_attn.o_proj.weight": 0.3061736529359485, "mlp.gate_proj.weight": 0.20992059002641766, "mlp.up_proj.weight": 0.22624838112491824, "mlp.down_proj.weight": 0.23975704132014206, "input_layernorm.weight": 0.9392704038111166, "post_attention_layernorm.weight": 0.534603342461067 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.3869070567163679, "self_attn.k_proj.weight": 0.41733788169115643, "self_attn.v_proj.weight": 0.6819478620941586, "self_attn.o_proj.weight": 0.5185018964772606, "mlp.gate_proj.weight": 0.3857952544575103, "mlp.up_proj.weight": 0.4188764368887535, "mlp.down_proj.weight": 0.43345911595128295, "input_layernorm.weight": 0.44958074027472317, "post_attention_layernorm.weight": 0.47017550272738196 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.41141851886171904, "self_attn.k_proj.weight": 0.3273305916332069, "self_attn.v_proj.weight": 0.5340342356611809, "self_attn.o_proj.weight": 0.479517156619146, "mlp.gate_proj.weight": 0.5126061476043234, "mlp.up_proj.weight": 0.6602585594995453, "mlp.down_proj.weight": 0.6384829779670611, "input_layernorm.weight": 0.4515625478959966, "post_attention_layernorm.weight": 0.31991898737374685 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.458330232710177, "self_attn.k_proj.weight": 0.5289200124472092, "self_attn.v_proj.weight": 0.5946075042986118, "self_attn.o_proj.weight": 0.4489154493805055, "mlp.gate_proj.weight": 0.22742880739738114, "mlp.up_proj.weight": 0.26307015782535986, "mlp.down_proj.weight": 0.3063196382255173, "input_layernorm.weight": 0.9061901333647325, "post_attention_layernorm.weight": 0.7022436233626503 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.35959892592956205, "self_attn.k_proj.weight": 0.3443886350766098, "self_attn.v_proj.weight": 0.5720661391629458, "self_attn.o_proj.weight": 0.5818776664595493, "mlp.gate_proj.weight": 0.39978735416382805, "mlp.up_proj.weight": 0.417749042840734, "mlp.down_proj.weight": 0.4387652280731668, "input_layernorm.weight": 0.38406718131548756, "post_attention_layernorm.weight": 0.6422503049460635 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.636542229290557, "self_attn.k_proj.weight": 0.614517102690962, "self_attn.v_proj.weight": 0.46813187109223736, "self_attn.o_proj.weight": 0.6041227476356936, "mlp.gate_proj.weight": 0.40712627903024656, "mlp.up_proj.weight": 0.5043431807036443, "mlp.down_proj.weight": 0.4505876614150708, "input_layernorm.weight": 0.6192326922748669, "post_attention_layernorm.weight": 0.6265746279680466 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.34925453885360336, "self_attn.k_proj.weight": 0.35278771152971217, "self_attn.v_proj.weight": 0.5525661476709113, "self_attn.o_proj.weight": 0.5383430454657737, "mlp.gate_proj.weight": 0.334643068005848, "mlp.up_proj.weight": 0.4327895515323508, "mlp.down_proj.weight": 0.4350180162313817, "input_layernorm.weight": 0.7771682873929657, "post_attention_layernorm.weight": 0.5724920885861154 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5426133527809779, "self_attn.k_proj.weight": 0.5917253556706261, "self_attn.v_proj.weight": 0.5803218647047271, "self_attn.o_proj.weight": 0.573619331665695, "mlp.gate_proj.weight": 0.304521619972936, "mlp.up_proj.weight": 0.38239116602834844, "mlp.down_proj.weight": 0.4294228175148867, "input_layernorm.weight": 0.41347205318383795, "post_attention_layernorm.weight": 0.4822200725964279 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.30216724988405375, "self_attn.k_proj.weight": 0.37973968358516746, "self_attn.v_proj.weight": 0.5855414716136844, "self_attn.o_proj.weight": 0.5522292406340652, "mlp.gate_proj.weight": 0.3154624609232963, "mlp.up_proj.weight": 0.4254449945154834, "mlp.down_proj.weight": 0.4676055055246573, "input_layernorm.weight": 0.6319572678591321, "post_attention_layernorm.weight": 0.49370291402388483 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.2122552427048082, "self_attn.k_proj.weight": 0.24749176650690055, "self_attn.v_proj.weight": 0.4473200679907213, "self_attn.o_proj.weight": 0.43028525139201423, "mlp.gate_proj.weight": 0.30637712121093663, "mlp.up_proj.weight": 0.3902341181994015, "mlp.down_proj.weight": 0.4368393972885733, "input_layernorm.weight": 0.2106297820601434, "post_attention_layernorm.weight": 0.5594815557912957 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.3306107310264498, "self_attn.k_proj.weight": 0.3770506265620634, "self_attn.v_proj.weight": 0.5762362986478698, "self_attn.o_proj.weight": 0.4960079169523212, "mlp.gate_proj.weight": 0.2982419943154518, "mlp.up_proj.weight": 0.37781123577136094, "mlp.down_proj.weight": 0.4191678549276368, "input_layernorm.weight": 0.38140716100377725, "post_attention_layernorm.weight": 0.49019038978320156 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.09993601246267203, "self_attn.k_proj.weight": 0.12521449073239727, "self_attn.v_proj.weight": 0.4907779577602679, "self_attn.o_proj.weight": 0.42575401537526514, "mlp.gate_proj.weight": 0.30881117685577564, "mlp.up_proj.weight": 0.36044257082955694, "mlp.down_proj.weight": 0.41282750868942547, "input_layernorm.weight": 0.5088983795968438, "post_attention_layernorm.weight": 0.4496861231516873 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.6602980044411406, "self_attn.k_proj.weight": 0.6420444265398998, "self_attn.v_proj.weight": 0.5316983977800627, "self_attn.o_proj.weight": 0.42227379180088404, "mlp.gate_proj.weight": 0.3108678287915571, "mlp.up_proj.weight": 0.4041657299501923, "mlp.down_proj.weight": 0.43265705494920426, "input_layernorm.weight": 0.878605184021867, "post_attention_layernorm.weight": 0.5910226949526707 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.5, "self_attn.k_proj.weight": 0.2599271947445671, "self_attn.v_proj.weight": 0.4975028664708022, "self_attn.o_proj.weight": 0.5421607676915916, "mlp.gate_proj.weight": 0.3056325201378473, "mlp.up_proj.weight": 0.43990546921978413, "mlp.down_proj.weight": 0.43404790428518036, "input_layernorm.weight": 0.4213949074080623, "post_attention_layernorm.weight": 0.09070741198253394 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.05946946364281485, "self_attn.k_proj.weight": 0.05838806857596083, "self_attn.v_proj.weight": 0.3147092170971509, "self_attn.o_proj.weight": 0.17507343342847417, "mlp.gate_proj.weight": 0.20693857128512452, "mlp.up_proj.weight": 0.17476325022355804, "mlp.down_proj.weight": 0.21355502956692002, "input_layernorm.weight": 0.049311151072639454, "post_attention_layernorm.weight": 0.8488525701289025 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.008338692721690102, "self_attn.k_proj.weight": 0.29374913202825975, "self_attn.v_proj.weight": 0.6461261275459721, "self_attn.o_proj.weight": 0.4819790617006713, "mlp.gate_proj.weight": 0.47106954949105545, "mlp.up_proj.weight": 0.45120924747821234, "mlp.down_proj.weight": 0.49902002059029626, "input_layernorm.weight": 0.5308434897842321, "post_attention_layernorm.weight": 0.33720525286504943 }, "metric": "dwce" }, { "num_batches": 64, "token_count": 131072.0, "norm": "relative", "supports_kwargs": true, "fuse_priors": { "self_attn.q_proj.weight": 0.9961265181629947, "self_attn.k_proj.weight": 0.6652207081963631, "self_attn.v_proj.weight": 0.6393890973156684, "self_attn.o_proj.weight": 0.5993613584307386, "mlp.gate_proj.weight": 0.3655389632232587, "mlp.up_proj.weight": 0.5096844599903007, "mlp.down_proj.weight": 0.4675655218682354, "input_layernorm.weight": 0.919265488845913, "post_attention_layernorm.weight": 0.21712004107502894 }, "metric": "dwce" }, { "excluded": true } ], "supports_kwargs": true, "max_batches": 0, "norm": "relative", "metric": "dwce", "cosine_topk": 3, "start_index": 0, "excluded_pairs": [ 0, 1, 20 ] }, "fisher_num_batches": 64, "merge_method": "reparam", "merged_params": 3, "num_sequences": 128, "teacher_source": "previous_cycle_memory", "teacher_cycle": 10, "eval": { "datasets": [ "wikitext" ], "configs": [ "wikitext-2-raw-v1" ], "split": "test", "num_samples": 200, "seq_len": 1024, "post_ppl": { "wikitext:wikitext-2-raw-v1": 38.01892516739481 } }, "comm": { "enabled": true, "train_mode": "lora", "opt_steps": 6, "grad_accum_steps": 1, "lr": 1e-05, "temp": 2.0, "steps_ratio": 0.1, "lr_scale": 0.1, "interaction_mode": "relative", "interaction_eps": 1e-08, "mu": 0.5, "mu_auto": true, "mu_auto_rho": 0.1, "mu_auto_eps": 1e-08, "sample_eta": 0.5, "sample_dwce_scale": 1.0, "topk": 1, "candidate_pairs": [ 16 ], "trainable_params": 14417920, "pair_selection": { "num_pairs": 21, "excluded_pairs": [ 0, 1, 20 ], "candidate_pairs": [ 16 ], "total_samples": 6, "unique_pairs": 1, "counts": [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0 ], "freqs": [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ], "probs": [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ], "top_pairs": [ { "pair": 16, "count": 6, "freq": 1.0, "prob": 1.0 } ] }, "avg_loss": 0.0004086560936388632, "avg_anchor": 0.00023934131847151244, "avg_interaction": 0.0413870836297671, "avg_mu": 0.0037876796102222893, "teacher_source": "previous_cycle_memory", "teacher_cycle": 10, "dwce_scores_pre": [ Infinity, Infinity, 0.08755428350373622, 0.09632033766022144, 0.12102187652892513, 0.07687722047289398, 0.07646200702022135, 0.10762881995392454, 0.11769039904744791, 0.13281737352618223, 0.13359121989702824, 0.12213174133312057, 0.09479378554179792, 0.0702238861787414, 0.09607953229105111, 0.1022260119464429, 0.04662290344732959, 0.08633913939177026, 0.14327522026540693, 0.14370887781314598, Infinity ], "post_ppl": { "wikitext:wikitext-2-raw-v1": 34.96986894820446 }, "post_selection_recomputed": false, "selected_layer_post": 16 }, "distill": { "enabled": true, "method": "reparam", "calib_samples": 256, "inst_samples": 0, "seq_len": 1024, "batch_size": 1, "epochs": 1.0, "lr": 0.0001, "kl_weight": 0.02, "kl_temp": 4.0, "hidden_mse_weight": 1.0, "attn_mse_weight": 0.25, "mlp_mse_weight": 1.0, "reparam_eta": 0.0, "reparam_gamma": 0.0, "reparam_attn_reg_scale": 1.0, "reparam_mlp_reg_scale": 1.0, "reparam_param_subset": "mlp", "reparam_stats": { "enabled": true, "epochs": 1.0, "lr": 0.0001, "hidden_mse_weight": 1.0, "attn_mse_weight": 0.25, "mlp_mse_weight": 1.0, "eta": 0.0, "gamma": 0.0, "attn_reg_scale": 1.0, "mlp_reg_scale": 1.0, "param_subset": "mlp", "num_gates": 3, "num_attn_gates": 0, "num_mlp_gates": 3, "num_other_gates": 0, "lambda_init": "fisher_prior" }, "reparam_gate_summary": { "num_tensors": 3, "num_elements": 176160768, "global_mean": 0.5134112267267137, "per_tensor_mean": { "mlp.gate_proj.weight": 0.5045510530471802, "mlp.up_proj.weight": 0.5129446983337402, "mlp.down_proj.weight": 0.522737979888916 } }, "post_ppl": { "wikitext:wikitext-2-raw-v1": 38.01892516739481 }, "weight_decay": 0.0, "max_grad_norm": 1.0, "grad_accum_steps": 1, "instruction_dataset": null, "instruction_config": null, "instruction_split": "train" }, "lora": { "enabled": false, "seq_len": 1024, "batch_size": 1, "epochs": 0.0, "rank": 8, "alpha": 16.0, "dropout": 0.0, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj" ], "respect_exclude_pairs": false, "kl_enabled": false, "kl_weight": 0.1, "kl_temp": 4.0, "post_ppl": null, "lr": 0.0001, "weight_decay": 0.0, "max_grad_norm": 1.0, "grad_accum_steps": 1, "log_steps": 100, "eval_every": 2000, "eval_max_batches": null }, "norm_policy": "hybrid", "full_model_saved": true, "full_model": { "stage": "cycle_11_full_model", "path": "llama3_mlp_skipdwce_nopermute/cycle_11/full_model", "weight_bytes": 11279358471, "post_ppl": { "wikitext:wikitext-2-raw-v1": 38.01892516739481 }, "resume_info": "resume_info.json" } }