{
  "cycle": 11,
  "layer_merged": 16,
  "num_layers_before": 22,
  "num_layers_after": 21,
  "fused_layer_state": "fused_layer.pt",
  "dwce_score": 0.04662290344732959,
  "dwce_scores": [
    null,
    null,
    0.08755428350373622,
    0.09632033766022144,
    0.12102187652892513,
    0.07687722047289398,
    0.07646200702022135,
    0.10762881995392454,
    0.11769039904744791,
    0.13281737352618223,
    0.13359121989702824,
    0.12213174133312057,
    0.09479378554179792,
    0.0702238861787414,
    0.09607953229105111,
    0.1022260119464429,
    0.04662290344732959,
    0.08633913939177026,
    0.14327522026540693,
    0.14370887781314598,
    null
  ],
  "dwce_meta": {
    "per_pair": [
      {
        "excluded": true
      },
      {
        "excluded": true
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5475512621731512,
          "self_attn.k_proj.weight": 0.391473276857112,
          "self_attn.v_proj.weight": 0.4909271140725403,
          "self_attn.o_proj.weight": 0.5059257993820573,
          "mlp.gate_proj.weight": 0.5142934415231778,
          "mlp.up_proj.weight": 0.564209682386205,
          "mlp.down_proj.weight": 0.5882046140979224,
          "input_layernorm.weight": 0.08591573704035245,
          "post_attention_layernorm.weight": 0.6916345704649807
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.42295415260881286,
          "self_attn.k_proj.weight": 0.6006598545932649,
          "self_attn.v_proj.weight": 0.5839130619828803,
          "self_attn.o_proj.weight": 0.3061736529359485,
          "mlp.gate_proj.weight": 0.20992059002641766,
          "mlp.up_proj.weight": 0.22624838112491824,
          "mlp.down_proj.weight": 0.23975704132014206,
          "input_layernorm.weight": 0.9392704038111166,
          "post_attention_layernorm.weight": 0.534603342461067
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3869070567163679,
          "self_attn.k_proj.weight": 0.41733788169115643,
          "self_attn.v_proj.weight": 0.6819478620941586,
          "self_attn.o_proj.weight": 0.5185018964772606,
          "mlp.gate_proj.weight": 0.3857952544575103,
          "mlp.up_proj.weight": 0.4188764368887535,
          "mlp.down_proj.weight": 0.43345911595128295,
          "input_layernorm.weight": 0.44958074027472317,
          "post_attention_layernorm.weight": 0.47017550272738196
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.41141851886171904,
          "self_attn.k_proj.weight": 0.3273305916332069,
          "self_attn.v_proj.weight": 0.5340342356611809,
          "self_attn.o_proj.weight": 0.479517156619146,
          "mlp.gate_proj.weight": 0.5126061476043234,
          "mlp.up_proj.weight": 0.6602585594995453,
          "mlp.down_proj.weight": 0.6384829779670611,
          "input_layernorm.weight": 0.4515625478959966,
          "post_attention_layernorm.weight": 0.31991898737374685
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.458330232710177,
          "self_attn.k_proj.weight": 0.5289200124472092,
          "self_attn.v_proj.weight": 0.5946075042986118,
          "self_attn.o_proj.weight": 0.4489154493805055,
          "mlp.gate_proj.weight": 0.22742880739738114,
          "mlp.up_proj.weight": 0.26307015782535986,
          "mlp.down_proj.weight": 0.3063196382255173,
          "input_layernorm.weight": 0.9061901333647325,
          "post_attention_layernorm.weight": 0.7022436233626503
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.35959892592956205,
          "self_attn.k_proj.weight": 0.3443886350766098,
          "self_attn.v_proj.weight": 0.5720661391629458,
          "self_attn.o_proj.weight": 0.5818776664595493,
          "mlp.gate_proj.weight": 0.39978735416382805,
          "mlp.up_proj.weight": 0.417749042840734,
          "mlp.down_proj.weight": 0.4387652280731668,
          "input_layernorm.weight": 0.38406718131548756,
          "post_attention_layernorm.weight": 0.6422503049460635
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.636542229290557,
          "self_attn.k_proj.weight": 0.614517102690962,
          "self_attn.v_proj.weight": 0.46813187109223736,
          "self_attn.o_proj.weight": 0.6041227476356936,
          "mlp.gate_proj.weight": 0.40712627903024656,
          "mlp.up_proj.weight": 0.5043431807036443,
          "mlp.down_proj.weight": 0.4505876614150708,
          "input_layernorm.weight": 0.6192326922748669,
          "post_attention_layernorm.weight": 0.6265746279680466
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.34925453885360336,
          "self_attn.k_proj.weight": 0.35278771152971217,
          "self_attn.v_proj.weight": 0.5525661476709113,
          "self_attn.o_proj.weight": 0.5383430454657737,
          "mlp.gate_proj.weight": 0.334643068005848,
          "mlp.up_proj.weight": 0.4327895515323508,
          "mlp.down_proj.weight": 0.4350180162313817,
          "input_layernorm.weight": 0.7771682873929657,
          "post_attention_layernorm.weight": 0.5724920885861154
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5426133527809779,
          "self_attn.k_proj.weight": 0.5917253556706261,
          "self_attn.v_proj.weight": 0.5803218647047271,
          "self_attn.o_proj.weight": 0.573619331665695,
          "mlp.gate_proj.weight": 0.304521619972936,
          "mlp.up_proj.weight": 0.38239116602834844,
          "mlp.down_proj.weight": 0.4294228175148867,
          "input_layernorm.weight": 0.41347205318383795,
          "post_attention_layernorm.weight": 0.4822200725964279
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30216724988405375,
          "self_attn.k_proj.weight": 0.37973968358516746,
          "self_attn.v_proj.weight": 0.5855414716136844,
          "self_attn.o_proj.weight": 0.5522292406340652,
          "mlp.gate_proj.weight": 0.3154624609232963,
          "mlp.up_proj.weight": 0.4254449945154834,
          "mlp.down_proj.weight": 0.4676055055246573,
          "input_layernorm.weight": 0.6319572678591321,
          "post_attention_layernorm.weight": 0.49370291402388483
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2122552427048082,
          "self_attn.k_proj.weight": 0.24749176650690055,
          "self_attn.v_proj.weight": 0.4473200679907213,
          "self_attn.o_proj.weight": 0.43028525139201423,
          "mlp.gate_proj.weight": 0.30637712121093663,
          "mlp.up_proj.weight": 0.3902341181994015,
          "mlp.down_proj.weight": 0.4368393972885733,
          "input_layernorm.weight": 0.2106297820601434,
          "post_attention_layernorm.weight": 0.5594815557912957
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3306107310264498,
          "self_attn.k_proj.weight": 0.3770506265620634,
          "self_attn.v_proj.weight": 0.5762362986478698,
          "self_attn.o_proj.weight": 0.4960079169523212,
          "mlp.gate_proj.weight": 0.2982419943154518,
          "mlp.up_proj.weight": 0.37781123577136094,
          "mlp.down_proj.weight": 0.4191678549276368,
          "input_layernorm.weight": 0.38140716100377725,
          "post_attention_layernorm.weight": 0.49019038978320156
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.09993601246267203,
          "self_attn.k_proj.weight": 0.12521449073239727,
          "self_attn.v_proj.weight": 0.4907779577602679,
          "self_attn.o_proj.weight": 0.42575401537526514,
          "mlp.gate_proj.weight": 0.30881117685577564,
          "mlp.up_proj.weight": 0.36044257082955694,
          "mlp.down_proj.weight": 0.41282750868942547,
          "input_layernorm.weight": 0.5088983795968438,
          "post_attention_layernorm.weight": 0.4496861231516873
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.6602980044411406,
          "self_attn.k_proj.weight": 0.6420444265398998,
          "self_attn.v_proj.weight": 0.5316983977800627,
          "self_attn.o_proj.weight": 0.42227379180088404,
          "mlp.gate_proj.weight": 0.3108678287915571,
          "mlp.up_proj.weight": 0.4041657299501923,
          "mlp.down_proj.weight": 0.43265705494920426,
          "input_layernorm.weight": 0.878605184021867,
          "post_attention_layernorm.weight": 0.5910226949526707
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.2599271947445671,
          "self_attn.v_proj.weight": 0.4975028664708022,
          "self_attn.o_proj.weight": 0.5421607676915916,
          "mlp.gate_proj.weight": 0.3056325201378473,
          "mlp.up_proj.weight": 0.43990546921978413,
          "mlp.down_proj.weight": 0.43404790428518036,
          "input_layernorm.weight": 0.4213949074080623,
          "post_attention_layernorm.weight": 0.09070741198253394
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.05946946364281485,
          "self_attn.k_proj.weight": 0.05838806857596083,
          "self_attn.v_proj.weight": 0.3147092170971509,
          "self_attn.o_proj.weight": 0.17507343342847417,
          "mlp.gate_proj.weight": 0.20693857128512452,
          "mlp.up_proj.weight": 0.17476325022355804,
          "mlp.down_proj.weight": 0.21355502956692002,
          "input_layernorm.weight": 0.049311151072639454,
          "post_attention_layernorm.weight": 0.8488525701289025
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.008338692721690102,
          "self_attn.k_proj.weight": 0.29374913202825975,
          "self_attn.v_proj.weight": 0.6461261275459721,
          "self_attn.o_proj.weight": 0.4819790617006713,
          "mlp.gate_proj.weight": 0.47106954949105545,
          "mlp.up_proj.weight": 0.45120924747821234,
          "mlp.down_proj.weight": 0.49902002059029626,
          "input_layernorm.weight": 0.5308434897842321,
          "post_attention_layernorm.weight": 0.33720525286504943
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.9961265181629947,
          "self_attn.k_proj.weight": 0.6652207081963631,
          "self_attn.v_proj.weight": 0.6393890973156684,
          "self_attn.o_proj.weight": 0.5993613584307386,
          "mlp.gate_proj.weight": 0.3655389632232587,
          "mlp.up_proj.weight": 0.5096844599903007,
          "mlp.down_proj.weight": 0.4675655218682354,
          "input_layernorm.weight": 0.919265488845913,
          "post_attention_layernorm.weight": 0.21712004107502894
        },
        "metric": "dwce"
      },
      {
        "excluded": true
      }
    ],
    "supports_kwargs": true,
    "max_batches": 0,
    "norm": "relative",
    "metric": "dwce",
    "cosine_topk": 3,
    "start_index": 0,
    "excluded_pairs": [
      0,
      1,
      20
    ]
  },
  "fisher_num_batches": 64,
  "merge_method": "reparam",
  "merged_params": 3,
  "num_sequences": 128,
  "teacher_source": "previous_cycle_memory",
  "teacher_cycle": 10,
  "eval": {
    "datasets": [
      "wikitext"
    ],
    "configs": [
      "wikitext-2-raw-v1"
    ],
    "split": "test",
    "num_samples": 200,
    "seq_len": 1024,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    }
  },
  "comm": {
    "enabled": true,
    "train_mode": "lora",
    "opt_steps": 6,
    "grad_accum_steps": 1,
    "lr": 1e-05,
    "temp": 2.0,
    "steps_ratio": 0.1,
    "lr_scale": 0.1,
    "interaction_mode": "relative",
    "interaction_eps": 1e-08,
    "mu": 0.5,
    "mu_auto": true,
    "mu_auto_rho": 0.1,
    "mu_auto_eps": 1e-08,
    "sample_eta": 0.5,
    "sample_dwce_scale": 1.0,
    "topk": 1,
    "candidate_pairs": [
      16
    ],
    "trainable_params": 14417920,
    "pair_selection": {
      "num_pairs": 21,
      "excluded_pairs": [
        0,
        1,
        20
      ],
      "candidate_pairs": [
        16
      ],
      "total_samples": 6,
      "unique_pairs": 1,
      "counts": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        6,
        0,
        0,
        0,
        0
      ],
      "freqs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "probs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "top_pairs": [
        {
          "pair": 16,
          "count": 6,
          "freq": 1.0,
          "prob": 1.0
        }
      ]
    },
    "avg_loss": 0.0004086560936388632,
    "avg_anchor": 0.00023934131847151244,
    "avg_interaction": 0.0413870836297671,
    "avg_mu": 0.0037876796102222893,
    "teacher_source": "previous_cycle_memory",
    "teacher_cycle": 10,
    "dwce_scores_pre": [
      null,
      null,
      0.08755428350373622,
      0.09632033766022144,
      0.12102187652892513,
      0.07687722047289398,
      0.07646200702022135,
      0.10762881995392454,
      0.11769039904744791,
      0.13281737352618223,
      0.13359121989702824,
      0.12213174133312057,
      0.09479378554179792,
      0.0702238861787414,
      0.09607953229105111,
      0.1022260119464429,
      0.04662290344732959,
      0.08633913939177026,
      0.14327522026540693,
      0.14370887781314598,
      null
    ],
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 34.96986894820446
    },
    "post_selection_recomputed": false,
    "selected_layer_post": 16
  },
  "distill": {
    "enabled": true,
    "method": "reparam",
    "calib_samples": 256,
    "inst_samples": 0,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 1.0,
    "lr": 0.0001,
    "kl_weight": 0.02,
    "kl_temp": 4.0,
    "hidden_mse_weight": 1.0,
    "attn_mse_weight": 0.25,
    "mlp_mse_weight": 1.0,
    "reparam_eta": 0.0,
    "reparam_gamma": 0.0,
    "reparam_attn_reg_scale": 1.0,
    "reparam_mlp_reg_scale": 1.0,
    "reparam_param_subset": "mlp",
    "reparam_stats": {
      "enabled": true,
      "epochs": 1.0,
      "lr": 0.0001,
      "hidden_mse_weight": 1.0,
      "attn_mse_weight": 0.25,
      "mlp_mse_weight": 1.0,
      "eta": 0.0,
      "gamma": 0.0,
      "attn_reg_scale": 1.0,
      "mlp_reg_scale": 1.0,
      "param_subset": "mlp",
      "num_gates": 3,
      "num_attn_gates": 0,
      "num_mlp_gates": 3,
      "num_other_gates": 0,
      "lambda_init": "fisher_prior"
    },
    "reparam_gate_summary": {
      "num_tensors": 3,
      "num_elements": 176160768,
      "global_mean": 0.5134112267267137,
      "per_tensor_mean": {
        "mlp.gate_proj.weight": 0.5045510530471802,
        "mlp.up_proj.weight": 0.5129446983337402,
        "mlp.down_proj.weight": 0.522737979888916
      }
    },
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "instruction_dataset": null,
    "instruction_config": null,
    "instruction_split": "train"
  },
  "lora": {
    "enabled": false,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 0.0,
    "rank": 8,
    "alpha": 16.0,
    "dropout": 0.0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "down_proj",
      "up_proj"
    ],
    "respect_exclude_pairs": false,
    "kl_enabled": false,
    "kl_weight": 0.1,
    "kl_temp": 4.0,
    "post_ppl": null,
    "lr": 0.0001,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "log_steps": 100,
    "eval_every": 2000,
    "eval_max_batches": null
  },
  "norm_policy": "hybrid",
  "full_model_saved": true,
  "full_model": {
    "stage": "cycle_11_full_model",
    "path": "llama3_mlp_skipdwce_nopermute/cycle_11/full_model",
    "weight_bytes": 11279358471,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "resume_info": "resume_info.json"
  }
}