{
"cycle": 11,
"layer_merged": 16,
"num_layers_before": 22,
"num_layers_after": 21,
"fused_layer_state": "fused_layer.pt",
"dwce_score": 0.04662290344732959,
"dwce_scores": [
        null,
        null,
0.08755428350373622,
0.09632033766022144,
0.12102187652892513,
0.07687722047289398,
0.07646200702022135,
0.10762881995392454,
0.11769039904744791,
0.13281737352618223,
0.13359121989702824,
0.12213174133312057,
0.09479378554179792,
0.0702238861787414,
0.09607953229105111,
0.1022260119464429,
0.04662290344732959,
0.08633913939177026,
0.14327522026540693,
0.14370887781314598,
        null
],
"dwce_meta": {
"per_pair": [
{
"excluded": true
},
{
"excluded": true
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5475512621731512,
"self_attn.k_proj.weight": 0.391473276857112,
"self_attn.v_proj.weight": 0.4909271140725403,
"self_attn.o_proj.weight": 0.5059257993820573,
"mlp.gate_proj.weight": 0.5142934415231778,
"mlp.up_proj.weight": 0.564209682386205,
"mlp.down_proj.weight": 0.5882046140979224,
"input_layernorm.weight": 0.08591573704035245,
"post_attention_layernorm.weight": 0.6916345704649807
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.42295415260881286,
"self_attn.k_proj.weight": 0.6006598545932649,
"self_attn.v_proj.weight": 0.5839130619828803,
"self_attn.o_proj.weight": 0.3061736529359485,
"mlp.gate_proj.weight": 0.20992059002641766,
"mlp.up_proj.weight": 0.22624838112491824,
"mlp.down_proj.weight": 0.23975704132014206,
"input_layernorm.weight": 0.9392704038111166,
"post_attention_layernorm.weight": 0.534603342461067
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.3869070567163679,
"self_attn.k_proj.weight": 0.41733788169115643,
"self_attn.v_proj.weight": 0.6819478620941586,
"self_attn.o_proj.weight": 0.5185018964772606,
"mlp.gate_proj.weight": 0.3857952544575103,
"mlp.up_proj.weight": 0.4188764368887535,
"mlp.down_proj.weight": 0.43345911595128295,
"input_layernorm.weight": 0.44958074027472317,
"post_attention_layernorm.weight": 0.47017550272738196
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.41141851886171904,
"self_attn.k_proj.weight": 0.3273305916332069,
"self_attn.v_proj.weight": 0.5340342356611809,
"self_attn.o_proj.weight": 0.479517156619146,
"mlp.gate_proj.weight": 0.5126061476043234,
"mlp.up_proj.weight": 0.6602585594995453,
"mlp.down_proj.weight": 0.6384829779670611,
"input_layernorm.weight": 0.4515625478959966,
"post_attention_layernorm.weight": 0.31991898737374685
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.458330232710177,
"self_attn.k_proj.weight": 0.5289200124472092,
"self_attn.v_proj.weight": 0.5946075042986118,
"self_attn.o_proj.weight": 0.4489154493805055,
"mlp.gate_proj.weight": 0.22742880739738114,
"mlp.up_proj.weight": 0.26307015782535986,
"mlp.down_proj.weight": 0.3063196382255173,
"input_layernorm.weight": 0.9061901333647325,
"post_attention_layernorm.weight": 0.7022436233626503
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.35959892592956205,
"self_attn.k_proj.weight": 0.3443886350766098,
"self_attn.v_proj.weight": 0.5720661391629458,
"self_attn.o_proj.weight": 0.5818776664595493,
"mlp.gate_proj.weight": 0.39978735416382805,
"mlp.up_proj.weight": 0.417749042840734,
"mlp.down_proj.weight": 0.4387652280731668,
"input_layernorm.weight": 0.38406718131548756,
"post_attention_layernorm.weight": 0.6422503049460635
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.636542229290557,
"self_attn.k_proj.weight": 0.614517102690962,
"self_attn.v_proj.weight": 0.46813187109223736,
"self_attn.o_proj.weight": 0.6041227476356936,
"mlp.gate_proj.weight": 0.40712627903024656,
"mlp.up_proj.weight": 0.5043431807036443,
"mlp.down_proj.weight": 0.4505876614150708,
"input_layernorm.weight": 0.6192326922748669,
"post_attention_layernorm.weight": 0.6265746279680466
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.34925453885360336,
"self_attn.k_proj.weight": 0.35278771152971217,
"self_attn.v_proj.weight": 0.5525661476709113,
"self_attn.o_proj.weight": 0.5383430454657737,
"mlp.gate_proj.weight": 0.334643068005848,
"mlp.up_proj.weight": 0.4327895515323508,
"mlp.down_proj.weight": 0.4350180162313817,
"input_layernorm.weight": 0.7771682873929657,
"post_attention_layernorm.weight": 0.5724920885861154
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5426133527809779,
"self_attn.k_proj.weight": 0.5917253556706261,
"self_attn.v_proj.weight": 0.5803218647047271,
"self_attn.o_proj.weight": 0.573619331665695,
"mlp.gate_proj.weight": 0.304521619972936,
"mlp.up_proj.weight": 0.38239116602834844,
"mlp.down_proj.weight": 0.4294228175148867,
"input_layernorm.weight": 0.41347205318383795,
"post_attention_layernorm.weight": 0.4822200725964279
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.30216724988405375,
"self_attn.k_proj.weight": 0.37973968358516746,
"self_attn.v_proj.weight": 0.5855414716136844,
"self_attn.o_proj.weight": 0.5522292406340652,
"mlp.gate_proj.weight": 0.3154624609232963,
"mlp.up_proj.weight": 0.4254449945154834,
"mlp.down_proj.weight": 0.4676055055246573,
"input_layernorm.weight": 0.6319572678591321,
"post_attention_layernorm.weight": 0.49370291402388483
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.2122552427048082,
"self_attn.k_proj.weight": 0.24749176650690055,
"self_attn.v_proj.weight": 0.4473200679907213,
"self_attn.o_proj.weight": 0.43028525139201423,
"mlp.gate_proj.weight": 0.30637712121093663,
"mlp.up_proj.weight": 0.3902341181994015,
"mlp.down_proj.weight": 0.4368393972885733,
"input_layernorm.weight": 0.2106297820601434,
"post_attention_layernorm.weight": 0.5594815557912957
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.3306107310264498,
"self_attn.k_proj.weight": 0.3770506265620634,
"self_attn.v_proj.weight": 0.5762362986478698,
"self_attn.o_proj.weight": 0.4960079169523212,
"mlp.gate_proj.weight": 0.2982419943154518,
"mlp.up_proj.weight": 0.37781123577136094,
"mlp.down_proj.weight": 0.4191678549276368,
"input_layernorm.weight": 0.38140716100377725,
"post_attention_layernorm.weight": 0.49019038978320156
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.09993601246267203,
"self_attn.k_proj.weight": 0.12521449073239727,
"self_attn.v_proj.weight": 0.4907779577602679,
"self_attn.o_proj.weight": 0.42575401537526514,
"mlp.gate_proj.weight": 0.30881117685577564,
"mlp.up_proj.weight": 0.36044257082955694,
"mlp.down_proj.weight": 0.41282750868942547,
"input_layernorm.weight": 0.5088983795968438,
"post_attention_layernorm.weight": 0.4496861231516873
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.6602980044411406,
"self_attn.k_proj.weight": 0.6420444265398998,
"self_attn.v_proj.weight": 0.5316983977800627,
"self_attn.o_proj.weight": 0.42227379180088404,
"mlp.gate_proj.weight": 0.3108678287915571,
"mlp.up_proj.weight": 0.4041657299501923,
"mlp.down_proj.weight": 0.43265705494920426,
"input_layernorm.weight": 0.878605184021867,
"post_attention_layernorm.weight": 0.5910226949526707
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.5,
"self_attn.k_proj.weight": 0.2599271947445671,
"self_attn.v_proj.weight": 0.4975028664708022,
"self_attn.o_proj.weight": 0.5421607676915916,
"mlp.gate_proj.weight": 0.3056325201378473,
"mlp.up_proj.weight": 0.43990546921978413,
"mlp.down_proj.weight": 0.43404790428518036,
"input_layernorm.weight": 0.4213949074080623,
"post_attention_layernorm.weight": 0.09070741198253394
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.05946946364281485,
"self_attn.k_proj.weight": 0.05838806857596083,
"self_attn.v_proj.weight": 0.3147092170971509,
"self_attn.o_proj.weight": 0.17507343342847417,
"mlp.gate_proj.weight": 0.20693857128512452,
"mlp.up_proj.weight": 0.17476325022355804,
"mlp.down_proj.weight": 0.21355502956692002,
"input_layernorm.weight": 0.049311151072639454,
"post_attention_layernorm.weight": 0.8488525701289025
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.008338692721690102,
"self_attn.k_proj.weight": 0.29374913202825975,
"self_attn.v_proj.weight": 0.6461261275459721,
"self_attn.o_proj.weight": 0.4819790617006713,
"mlp.gate_proj.weight": 0.47106954949105545,
"mlp.up_proj.weight": 0.45120924747821234,
"mlp.down_proj.weight": 0.49902002059029626,
"input_layernorm.weight": 0.5308434897842321,
"post_attention_layernorm.weight": 0.33720525286504943
},
"metric": "dwce"
},
{
"num_batches": 64,
"token_count": 131072.0,
"norm": "relative",
"supports_kwargs": true,
"fuse_priors": {
"self_attn.q_proj.weight": 0.9961265181629947,
"self_attn.k_proj.weight": 0.6652207081963631,
"self_attn.v_proj.weight": 0.6393890973156684,
"self_attn.o_proj.weight": 0.5993613584307386,
"mlp.gate_proj.weight": 0.3655389632232587,
"mlp.up_proj.weight": 0.5096844599903007,
"mlp.down_proj.weight": 0.4675655218682354,
"input_layernorm.weight": 0.919265488845913,
"post_attention_layernorm.weight": 0.21712004107502894
},
"metric": "dwce"
},
{
"excluded": true
}
],
"supports_kwargs": true,
"max_batches": 0,
"norm": "relative",
"metric": "dwce",
"cosine_topk": 3,
"start_index": 0,
"excluded_pairs": [
0,
1,
20
]
},
"fisher_num_batches": 64,
"merge_method": "reparam",
"merged_params": 3,
"num_sequences": 128,
"teacher_source": "previous_cycle_memory",
"teacher_cycle": 10,
"eval": {
"datasets": [
"wikitext"
],
"configs": [
"wikitext-2-raw-v1"
],
"split": "test",
"num_samples": 200,
"seq_len": 1024,
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 38.01892516739481
}
},
"comm": {
"enabled": true,
"train_mode": "lora",
"opt_steps": 6,
"grad_accum_steps": 1,
"lr": 1e-05,
"temp": 2.0,
"steps_ratio": 0.1,
"lr_scale": 0.1,
"interaction_mode": "relative",
"interaction_eps": 1e-08,
"mu": 0.5,
"mu_auto": true,
"mu_auto_rho": 0.1,
"mu_auto_eps": 1e-08,
"sample_eta": 0.5,
"sample_dwce_scale": 1.0,
"topk": 1,
"candidate_pairs": [
16
],
"trainable_params": 14417920,
"pair_selection": {
"num_pairs": 21,
"excluded_pairs": [
0,
1,
20
],
"candidate_pairs": [
16
],
"total_samples": 6,
"unique_pairs": 1,
"counts": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
6,
0,
0,
0,
0
],
"freqs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
0.0,
0.0,
0.0,
0.0
],
"probs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
0.0,
0.0,
0.0,
0.0
],
"top_pairs": [
{
"pair": 16,
"count": 6,
"freq": 1.0,
"prob": 1.0
}
]
},
"avg_loss": 0.0004086560936388632,
"avg_anchor": 0.00023934131847151244,
"avg_interaction": 0.0413870836297671,
"avg_mu": 0.0037876796102222893,
"teacher_source": "previous_cycle_memory",
"teacher_cycle": 10,
"dwce_scores_pre": [
            null,
            null,
0.08755428350373622,
0.09632033766022144,
0.12102187652892513,
0.07687722047289398,
0.07646200702022135,
0.10762881995392454,
0.11769039904744791,
0.13281737352618223,
0.13359121989702824,
0.12213174133312057,
0.09479378554179792,
0.0702238861787414,
0.09607953229105111,
0.1022260119464429,
0.04662290344732959,
0.08633913939177026,
0.14327522026540693,
0.14370887781314598,
            null
],
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 34.96986894820446
},
"post_selection_recomputed": false,
"selected_layer_post": 16
},
"distill": {
"enabled": true,
"method": "reparam",
"calib_samples": 256,
"inst_samples": 0,
"seq_len": 1024,
"batch_size": 1,
"epochs": 1.0,
"lr": 0.0001,
"kl_weight": 0.02,
"kl_temp": 4.0,
"hidden_mse_weight": 1.0,
"attn_mse_weight": 0.25,
"mlp_mse_weight": 1.0,
"reparam_eta": 0.0,
"reparam_gamma": 0.0,
"reparam_attn_reg_scale": 1.0,
"reparam_mlp_reg_scale": 1.0,
"reparam_param_subset": "mlp",
"reparam_stats": {
"enabled": true,
"epochs": 1.0,
"lr": 0.0001,
"hidden_mse_weight": 1.0,
"attn_mse_weight": 0.25,
"mlp_mse_weight": 1.0,
"eta": 0.0,
"gamma": 0.0,
"attn_reg_scale": 1.0,
"mlp_reg_scale": 1.0,
"param_subset": "mlp",
"num_gates": 3,
"num_attn_gates": 0,
"num_mlp_gates": 3,
"num_other_gates": 0,
"lambda_init": "fisher_prior"
},
"reparam_gate_summary": {
"num_tensors": 3,
"num_elements": 176160768,
"global_mean": 0.5134112267267137,
"per_tensor_mean": {
"mlp.gate_proj.weight": 0.5045510530471802,
"mlp.up_proj.weight": 0.5129446983337402,
"mlp.down_proj.weight": 0.522737979888916
}
},
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 38.01892516739481
},
"weight_decay": 0.0,
"max_grad_norm": 1.0,
"grad_accum_steps": 1,
"instruction_dataset": null,
"instruction_config": null,
"instruction_split": "train"
},
"lora": {
"enabled": false,
"seq_len": 1024,
"batch_size": 1,
"epochs": 0.0,
"rank": 8,
"alpha": 16.0,
"dropout": 0.0,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"up_proj"
],
"respect_exclude_pairs": false,
"kl_enabled": false,
"kl_weight": 0.1,
"kl_temp": 4.0,
"post_ppl": null,
"lr": 0.0001,
"weight_decay": 0.0,
"max_grad_norm": 1.0,
"grad_accum_steps": 1,
"log_steps": 100,
"eval_every": 2000,
"eval_max_batches": null
},
"norm_policy": "hybrid",
"full_model_saved": true,
"full_model": {
"stage": "cycle_11_full_model",
"path": "llama3_mlp_skipdwce_nopermute/cycle_11/full_model",
"weight_bytes": 11279358471,
"post_ppl": {
"wikitext:wikitext-2-raw-v1": 38.01892516739481
},
"resume_info": "resume_info.json"
}
}