temp_ss / cycle_6 /cycle_metadata.json
LJYAI's picture
abprune experiment
98b9392 verified
Invalid JSON: Unexpected token 'I', ..."s": [ Infinity, "... is not valid JSON
{
"cycle": 6,
"layer_merged": 21,
"num_layers_before": 27,
"num_layers_after": 26,
"fused_layer_state": "fused_layer.pt",
"dwce_score": 0.060528135887943944,
"dwce_scores": [
null,
null,
0.10885138210341537,
0.09962797171315212,
0.09704690759872063,
0.10032807292174113,
0.07477073061165104,
0.06816870958942334,
0.08890422275158102,
0.1163330007130214,
0.14856900172908535,
0.15104770110436222,
0.15976559604817797,
0.16747390251227062,
0.17331518344021776,
0.14107504898742165,
0.12868654620236872,
0.13613157978454246,
0.1218191001657377,
0.10902217379118222,
0.08936219242296228,
0.060528135887943944,
0.07539605212866492,
0.09116655794274726,
0.11185780392379165,
null
],
"dwce_meta": {
"per_pair": [
{
"excluded": true
},
{
"excluded": true
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.45602849053786343,
"self_attn.k_proj.weight": 0.393914402513139,
"self_attn.v_proj.weight": 0.540396339894555,
"self_attn.o_proj.weight": 0.4469669933138352,
"mlp.gate_proj.weight": 0.3287149601078028,
"mlp.up_proj.weight": 0.35017769317939573,
"mlp.down_proj.weight": 0.396783937778306,
"input_layernorm.weight": 0.17714660120924552,
"post_attention_layernorm.weight": 0.5706843420160465
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.2956975268685422,
"self_attn.k_proj.weight": 0.3772577526025228,
"self_attn.v_proj.weight": 0.5824193735130013,
"self_attn.o_proj.weight": 0.44336875810997856,
"mlp.gate_proj.weight": 0.30769135298804196,
"mlp.up_proj.weight": 0.35382330315519916,
"mlp.down_proj.weight": 0.37185043779817395,
"input_layernorm.weight": 0.7024250477029108,
"post_attention_layernorm.weight": 0.5121384967983699
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.30319082595336366,
"self_attn.k_proj.weight": 0.3492474196377044,
"self_attn.v_proj.weight": 0.569890311972998,
"self_attn.o_proj.weight": 0.40464328058154025,
"mlp.gate_proj.weight": 0.28532597693591105,
"mlp.up_proj.weight": 0.37282024323447943,
"mlp.down_proj.weight": 0.3841916217498055,
"input_layernorm.weight": 0.4863317106902496,
"post_attention_layernorm.weight": 0.57501372396618
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.30239571422605777,
"self_attn.k_proj.weight": 0.28777982633775234,
"self_attn.v_proj.weight": 0.5214361921442343,
"self_attn.o_proj.weight": 0.43105923012955555,
"mlp.gate_proj.weight": 0.2730870548532783,
"mlp.up_proj.weight": 0.3623753277725939,
"mlp.down_proj.weight": 0.38708954857683764,
"input_layernorm.weight": 0.4643158587476409,
"post_attention_layernorm.weight": 0.4712457550840951
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.34829974358469984,
"self_attn.k_proj.weight": 0.3124058890492968,
"self_attn.v_proj.weight": 0.44195795428146895,
"self_attn.o_proj.weight": 0.4143407830271052,
"mlp.gate_proj.weight": 0.5,
"mlp.up_proj.weight": 0.5509473359135564,
"mlp.down_proj.weight": 0.5172651880216964,
"input_layernorm.weight": 0.6178273084347614,
"post_attention_layernorm.weight": 0.34427816773879094
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.2730859710153346,
"self_attn.k_proj.weight": 0.3061184297688129,
"self_attn.v_proj.weight": 0.533020418936679,
"self_attn.o_proj.weight": 0.4119345004856529,
"mlp.gate_proj.weight": 0.5,
"mlp.up_proj.weight": 0.13066309879562818,
"mlp.down_proj.weight": 0.2330601154961996,
"input_layernorm.weight": 0.8059542094784746,
"post_attention_layernorm.weight": 0.6625755866545766
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.48768939039675996,
"self_attn.k_proj.weight": 0.4777739962444542,
"self_attn.v_proj.weight": 0.5103851366152496,
"self_attn.o_proj.weight": 0.43250945492739656,
"mlp.gate_proj.weight": 0.3257707165607638,
"mlp.up_proj.weight": 0.4096843334038478,
"mlp.down_proj.weight": 0.34885861400447404,
"input_layernorm.weight": 0.4200429536840167,
"post_attention_layernorm.weight": 0.5367871634336786
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.29755787633442426,
"self_attn.k_proj.weight": 0.37623755403743336,
"self_attn.v_proj.weight": 0.6136683116996197,
"self_attn.o_proj.weight": 0.5741847831544276,
"mlp.gate_proj.weight": 0.27139084405979624,
"mlp.up_proj.weight": 0.352502726534821,
"mlp.down_proj.weight": 0.3594937163598074,
"input_layernorm.weight": 0.5244406580066927,
"post_attention_layernorm.weight": 0.5383591174280284
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.48784027196921265,
"self_attn.k_proj.weight": 0.39846873275053135,
"self_attn.v_proj.weight": 0.4618083281504365,
"self_attn.o_proj.weight": 0.5396497221340433,
"mlp.gate_proj.weight": 0.28731097517799203,
"mlp.up_proj.weight": 0.37777335079811836,
"mlp.down_proj.weight": 0.38422114872976587,
"input_layernorm.weight": 0.5041621307381597,
"post_attention_layernorm.weight": 0.5670639954113798
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.23827875940660836,
"self_attn.k_proj.weight": 0.3238894105610408,
"self_attn.v_proj.weight": 0.5690090563700821,
"self_attn.o_proj.weight": 0.5450013202392769,
"mlp.gate_proj.weight": 0.27271401584431,
"mlp.up_proj.weight": 0.3708806850281214,
"mlp.down_proj.weight": 0.39500743275208433,
"input_layernorm.weight": 0.6987078362557021,
"post_attention_layernorm.weight": 0.5374434799165528
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.3946032688367209,
"self_attn.k_proj.weight": 0.40690493743203565,
"self_attn.v_proj.weight": 0.5164469910119862,
"self_attn.o_proj.weight": 0.4597598293804492,
"mlp.gate_proj.weight": 0.269573396995867,
"mlp.up_proj.weight": 0.35366279000533957,
"mlp.down_proj.weight": 0.3877383926763193,
"input_layernorm.weight": 0.5460628958641058,
"post_attention_layernorm.weight": 0.5071179303686918
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.3008175631152415,
"self_attn.v_proj.weight": 0.5493743908332103,
"self_attn.o_proj.weight": 0.4661606538187534,
"mlp.gate_proj.weight": 0.2783245620057637,
"mlp.up_proj.weight": 0.36798475292532096,
"mlp.down_proj.weight": 0.41990780468185046,
"input_layernorm.weight": 0.3526699737302493,
"post_attention_layernorm.weight": 0.5026090703676459
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.5046790280321507,
"self_attn.o_proj.weight": 0.47862372917098306,
"mlp.gate_proj.weight": 0.28429170701639556,
"mlp.up_proj.weight": 0.3568216404807901,
"mlp.down_proj.weight": 0.4024126479370246,
"input_layernorm.weight": 0.6964693773965374,
"post_attention_layernorm.weight": 0.5389671586153284
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.5711961200240154,
"self_attn.o_proj.weight": 0.4091136757892634,
"mlp.gate_proj.weight": 0.28165197173200957,
"mlp.up_proj.weight": 0.3558940378823637,
"mlp.down_proj.weight": 0.3944025471751477,
"input_layernorm.weight": 0.34554249375842755,
"post_attention_layernorm.weight": 0.5083749806126059
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.4798904930258487,
"self_attn.o_proj.weight": 0.3796993330110259,
"mlp.gate_proj.weight": 0.2834371362960443,
"mlp.up_proj.weight": 0.34598115404819363,
"mlp.down_proj.weight": 0.3810985843525749,
"input_layernorm.weight": 0.40754399908659844,
"post_attention_layernorm.weight": 0.497314728281144
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.5081409141199547,
"self_attn.o_proj.weight": 0.3629102582375656,
"mlp.gate_proj.weight": 0.2855739089361027,
"mlp.up_proj.weight": 0.35880372082662226,
"mlp.down_proj.weight": 0.38944742416779293,
"input_layernorm.weight": 0.6608006181001739,
"post_attention_layernorm.weight": 0.5296959070502865
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.49572269146465,
"self_attn.o_proj.weight": 0.32994298892262397,
"mlp.gate_proj.weight": 0.27769206189679957,
"mlp.up_proj.weight": 0.3455361882658386,
"mlp.down_proj.weight": 0.3735752249423331,
"input_layernorm.weight": 0.4170270576790789,
"post_attention_layernorm.weight": 0.5209363007605031
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.4867919706742694,
"self_attn.o_proj.weight": 0.3538593097308571,
"mlp.gate_proj.weight": 0.27117298636599874,
"mlp.up_proj.weight": 0.33462896686497223,
"mlp.down_proj.weight": 0.3612770373233193,
"input_layernorm.weight": 0.6936974577548146,
"post_attention_layernorm.weight": 0.5001178185439411
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.09349704918366467,
"self_attn.k_proj.weight": 0.5,
"self_attn.v_proj.weight": 0.47712484653665943,
"self_attn.o_proj.weight": 0.29886608310554436,
"mlp.gate_proj.weight": 0.2693949444903617,
"mlp.up_proj.weight": 0.3291873676707558,
"mlp.down_proj.weight": 0.35004469298030644,
"input_layernorm.weight": 0.33369843125590304,
"post_attention_layernorm.weight": 0.5011505559153778
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.40387717352354646,
"self_attn.k_proj.weight": 0.32231430163408453,
"self_attn.v_proj.weight": 0.42154930635885873,
"self_attn.o_proj.weight": 0.2721077912952483,
"mlp.gate_proj.weight": 0.5,
"mlp.up_proj.weight": 0.40582628271960014,
"mlp.down_proj.weight": 0.3570913409001458,
"input_layernorm.weight": 0.5122671690645244,
"post_attention_layernorm.weight": 0.49147814328550515
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.1520219921182753,
"self_attn.k_proj.weight": 0.13721084571035752,
"self_attn.v_proj.weight": 0.40551897093604916,
"self_attn.o_proj.weight": 0.280649542041515,
"mlp.gate_proj.weight": 0.5,
"mlp.up_proj.weight": 0.14720318423611445,
"mlp.down_proj.weight": 0.17429299506277318,
"input_layernorm.weight": 0.09607062192796464,
"post_attention_layernorm.weight": 0.44690356138952886
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.007074450011806872,
"self_attn.k_proj.weight": 0.1615631246455786,
"self_attn.v_proj.weight": 0.5877874751289588,
"self_attn.o_proj.weight": 0.3278764246216598,
"mlp.gate_proj.weight": 0.313893134211228,
"mlp.up_proj.weight": 0.3528312375210034,
"mlp.down_proj.weight": 0.35640966437698607,
"input_layernorm.weight": 0.6657340014175253,
"post_attention_layernorm.weight": 0.39247642201648
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.9892781643373502,
"self_attn.k_proj.weight": 0.657748170777267,
"self_attn.v_proj.weight": 0.4684578019872222,
"self_attn.o_proj.weight": 0.4193140384970065,
"mlp.gate_proj.weight": 0.5,
"mlp.up_proj.weight": 0.3405968802707512,
"mlp.down_proj.weight": 0.3304433709433969,
"input_layernorm.weight": 0.6733460076353963,
"post_attention_layernorm.weight": 0.15409983400561383
},
"metric": "dwce"
},
{
"excluded": true
}
],
"supports_kwargs": true,
"max_batches": 0,
"norm": "relative",
"metric": "dwce",
"cosine_topk": 3,
"start_index": 0,
"excluded_pairs": [
0,
1,
25
]
},
"fisher_num_batches": 64,
"merge_method": "reparam",
"merged_params": 3,
"num_sequences": 128,
"teacher_source": "previous_cycle_memory",
"teacher_cycle": 5,
"eval": {
"datasets": [
"wikitext"
],
"configs": [
"wikitext-2-raw-v1"
],
"split": "test",
"num_samples": 200,
"seq_len": 1024,
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 19.929857094583117
}
},
"comm": {
"enabled": true,
"train_mode": "lora",
"opt_steps": 6,
"grad_accum_steps": 1,
"lr": 1e-05,
"temp": 2.0,
"steps_ratio": 0.1,
"lr_scale": 0.1,
"interaction_mode": "relative",
"interaction_eps": 1e-08,
"mu": 0.5,
"mu_auto": true,
"mu_auto_rho": 0.1,
"mu_auto_eps": 1e-08,
"sample_eta": 0.5,
"sample_dwce_scale": 1.0,
"topk": 1,
"candidate_pairs": [
21
],
"trainable_params": 17694720,
"pair_selection": {
"num_pairs": 26,
"excluded_pairs": [
0,
1,
25
],
"candidate_pairs": [
21
],
"total_samples": 6,
"unique_pairs": 1,
"counts": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
6,
0,
0,
0,
0
],
"freqs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
0.0,
0.0,
0.0,
0.0
],
"probs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
0.0,
0.0,
0.0,
0.0
],
"top_pairs": [
{
"pair": 21,
"count": 6,
"freq": 1.0,
"prob": 1.0
}
]
},
"avg_loss": 0.0004060502259892,
"avg_anchor": 0.0002513490423249702,
"avg_interaction": 0.09605961292982101,
"avg_mu": 0.0015758845697112923,
"teacher_source": "previous_cycle_memory",
"teacher_cycle": 5,
"dwce_scores_pre": [
null,
null,
0.10885138210341537,
0.09962797171315212,
0.09704690759872063,
0.10032807292174113,
0.07477073061165104,
0.06816870958942334,
0.08890422275158102,
0.1163330007130214,
0.14856900172908535,
0.15104770110436222,
0.15976559604817797,
0.16747390251227062,
0.17331518344021776,
0.14107504898742165,
0.12868654620236872,
0.13613157978454246,
0.1218191001657377,
0.10902217379118222,
0.08936219242296228,
0.060528135887943944,
0.07539605212866492,
0.09116655794274726,
0.11185780392379165,
null
],
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 18.54056249689346
},
"post_selection_recomputed": false,
"selected_layer_post": 21
},
"distill": {
"enabled": true,
"method": "reparam",
"calib_samples": 256,
"inst_samples": 0,
"seq_len": 1024,
"batch_size": 1,
"epochs": 1.0,
"lr": 0.0001,
"kl_weight": 0.02,
"kl_temp": 4.0,
"hidden_mse_weight": 1.0,
"attn_mse_weight": 0.25,
"mlp_mse_weight": 1.0,
"reparam_eta": 0.0,
"reparam_gamma": 0.0,
"reparam_attn_reg_scale": 1.0,
"reparam_mlp_reg_scale": 1.0,
"reparam_param_subset": "mlp",
"reparam_stats": {
"enabled": true,
"epochs": 1.0,
"lr": 0.0001,
"hidden_mse_weight": 1.0,
"attn_mse_weight": 0.25,
"mlp_mse_weight": 1.0,
"eta": 0.0,
"gamma": 0.0,
"attn_reg_scale": 1.0,
"mlp_reg_scale": 1.0,
"param_subset": "mlp",
"num_gates": 3,
"num_attn_gates": 0,
"num_mlp_gates": 3,
"num_other_gates": 0,
"lambda_init": "fisher_prior"
},
"reparam_gate_summary": {
"num_tensors": 3,
"num_elements": 176160768,
"global_mean": 0.48417738505772184,
"per_tensor_mean": {
"mlp.gate_proj.weight": 0.4960911273956299,
"mlp.up_proj.weight": 0.4809410572052002,
"mlp.down_proj.weight": 0.47550004720687866
}
},
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 19.929857094583117
},
"weight_decay": 0.0,
"max_grad_norm": 1.0,
"grad_accum_steps": 1,
"instruction_dataset": null,
"instruction_config": null,
"instruction_split": "train"
},
"lora": {
"enabled": false,
"seq_len": 1024,
"batch_size": 1,
"epochs": 0.0,
"rank": 8,
"alpha": 16.0,
"dropout": 0.0,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"respect_exclude_pairs": false,
"kl_enabled": false,
"kl_weight": 0.1,
"kl_temp": 4.0,
"post_ppl": null,
"lr": 0.0001,
"weight_decay": 0.0,
"max_grad_norm": 1.0,
"grad_accum_steps": 1,
"log_steps": 100,
"eval_every": 2000,
"eval_max_batches": null
},
"norm_policy": "hybrid",
"full_model_saved": true,
"full_model": {
"stage": "cycle_6_full_model",
"path": "llama3_mlp_skipdwce_nopermute/cycle_6/full_model",
"weight_bytes": 13460487442,
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 19.929857094583117
},
"resume_info": "resume_info.json"
}
}