{
  "cycle": 11,
  "layer_merged": 16,
  "num_layers_before": 22,
  "num_layers_after": 21,
  "fused_layer_state": "fused_layer.pt",
  "dwce_score": 0.04662290344732959,
  "dwce_scores": [
    null,
    null,
    0.08755428350373622,
    0.09632033766022144,
    0.12102187652892513,
    0.07687722047289398,
    0.07646200702022135,
    0.10762881995392454,
    0.11769039904744791,
    0.13281737352618223,
    0.13359121989702824,
    0.12213174133312057,
    0.09479378554179792,
    0.0702238861787414,
    0.09607953229105111,
    0.1022260119464429,
    0.04662290344732959,
    0.08633913939177026,
    0.14327522026540693,
    0.14370887781314598,
    null
  ],
  "dwce_meta": {
    "per_pair": [
      {
        "excluded": true
      },
      {
        "excluded": true
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5475512621731512,
          "self_attn.k_proj.weight": 0.391473276857112,
          "self_attn.v_proj.weight": 0.4909271140725403,
          "self_attn.o_proj.weight": 0.5059257993820573,
          "mlp.gate_proj.weight": 0.5142934415231778,
          "mlp.up_proj.weight": 0.564209682386205,
          "mlp.down_proj.weight": 0.5882046140979224,
          "input_layernorm.weight": 0.08591573704035245,
          "post_attention_layernorm.weight": 0.6916345704649807
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.42295415260881286,
          "self_attn.k_proj.weight": 0.6006598545932649,
          "self_attn.v_proj.weight": 0.5839130619828803,
          "self_attn.o_proj.weight": 0.3061736529359485,
          "mlp.gate_proj.weight": 0.20992059002641766,
          "mlp.up_proj.weight": 0.22624838112491824,
          "mlp.down_proj.weight": 0.23975704132014206,
          "input_layernorm.weight": 0.9392704038111166,
          "post_attention_layernorm.weight": 0.534603342461067
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3869070567163679,
          "self_attn.k_proj.weight": 0.41733788169115643,
          "self_attn.v_proj.weight": 0.6819478620941586,
          "self_attn.o_proj.weight": 0.5185018964772606,
          "mlp.gate_proj.weight": 0.3857952544575103,
          "mlp.up_proj.weight": 0.4188764368887535,
          "mlp.down_proj.weight": 0.43345911595128295,
          "input_layernorm.weight": 0.44958074027472317,
          "post_attention_layernorm.weight": 0.47017550272738196
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.41141851886171904,
          "self_attn.k_proj.weight": 0.3273305916332069,
          "self_attn.v_proj.weight": 0.5340342356611809,
          "self_attn.o_proj.weight": 0.479517156619146,
          "mlp.gate_proj.weight": 0.5126061476043234,
          "mlp.up_proj.weight": 0.6602585594995453,
          "mlp.down_proj.weight": 0.6384829779670611,
          "input_layernorm.weight": 0.4515625478959966,
          "post_attention_layernorm.weight": 0.31991898737374685
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.458330232710177,
          "self_attn.k_proj.weight": 0.5289200124472092,
          "self_attn.v_proj.weight": 0.5946075042986118,
          "self_attn.o_proj.weight": 0.4489154493805055,
          "mlp.gate_proj.weight": 0.22742880739738114,
          "mlp.up_proj.weight": 0.26307015782535986,
          "mlp.down_proj.weight": 0.3063196382255173,
          "input_layernorm.weight": 0.9061901333647325,
          "post_attention_layernorm.weight": 0.7022436233626503
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.35959892592956205,
          "self_attn.k_proj.weight": 0.3443886350766098,
          "self_attn.v_proj.weight": 0.5720661391629458,
          "self_attn.o_proj.weight": 0.5818776664595493,
          "mlp.gate_proj.weight": 0.39978735416382805,
          "mlp.up_proj.weight": 0.417749042840734,
          "mlp.down_proj.weight": 0.4387652280731668,
          "input_layernorm.weight": 0.38406718131548756,
          "post_attention_layernorm.weight": 0.6422503049460635
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.636542229290557,
          "self_attn.k_proj.weight": 0.614517102690962,
          "self_attn.v_proj.weight": 0.46813187109223736,
          "self_attn.o_proj.weight": 0.6041227476356936,
          "mlp.gate_proj.weight": 0.40712627903024656,
          "mlp.up_proj.weight": 0.5043431807036443,
          "mlp.down_proj.weight": 0.4505876614150708,
          "input_layernorm.weight": 0.6192326922748669,
          "post_attention_layernorm.weight": 0.6265746279680466
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.34925453885360336,
          "self_attn.k_proj.weight": 0.35278771152971217,
          "self_attn.v_proj.weight": 0.5525661476709113,
          "self_attn.o_proj.weight": 0.5383430454657737,
          "mlp.gate_proj.weight": 0.334643068005848,
          "mlp.up_proj.weight": 0.4327895515323508,
          "mlp.down_proj.weight": 0.4350180162313817,
          "input_layernorm.weight": 0.7771682873929657,
          "post_attention_layernorm.weight": 0.5724920885861154
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5426133527809779,
          "self_attn.k_proj.weight": 0.5917253556706261,
          "self_attn.v_proj.weight": 0.5803218647047271,
          "self_attn.o_proj.weight": 0.573619331665695,
          "mlp.gate_proj.weight": 0.304521619972936,
          "mlp.up_proj.weight": 0.38239116602834844,
          "mlp.down_proj.weight": 0.4294228175148867,
          "input_layernorm.weight": 0.41347205318383795,
          "post_attention_layernorm.weight": 0.4822200725964279
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.30216724988405375,
          "self_attn.k_proj.weight": 0.37973968358516746,
          "self_attn.v_proj.weight": 0.5855414716136844,
          "self_attn.o_proj.weight": 0.5522292406340652,
          "mlp.gate_proj.weight": 0.3154624609232963,
          "mlp.up_proj.weight": 0.4254449945154834,
          "mlp.down_proj.weight": 0.4676055055246573,
          "input_layernorm.weight": 0.6319572678591321,
          "post_attention_layernorm.weight": 0.49370291402388483
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.2122552427048082,
          "self_attn.k_proj.weight": 0.24749176650690055,
          "self_attn.v_proj.weight": 0.4473200679907213,
          "self_attn.o_proj.weight": 0.43028525139201423,
          "mlp.gate_proj.weight": 0.30637712121093663,
          "mlp.up_proj.weight": 0.3902341181994015,
          "mlp.down_proj.weight": 0.4368393972885733,
          "input_layernorm.weight": 0.2106297820601434,
          "post_attention_layernorm.weight": 0.5594815557912957
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.3306107310264498,
          "self_attn.k_proj.weight": 0.3770506265620634,
          "self_attn.v_proj.weight": 0.5762362986478698,
          "self_attn.o_proj.weight": 0.4960079169523212,
          "mlp.gate_proj.weight": 0.2982419943154518,
          "mlp.up_proj.weight": 0.37781123577136094,
          "mlp.down_proj.weight": 0.4191678549276368,
          "input_layernorm.weight": 0.38140716100377725,
          "post_attention_layernorm.weight": 0.49019038978320156
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.09993601246267203,
          "self_attn.k_proj.weight": 0.12521449073239727,
          "self_attn.v_proj.weight": 0.4907779577602679,
          "self_attn.o_proj.weight": 0.42575401537526514,
          "mlp.gate_proj.weight": 0.30881117685577564,
          "mlp.up_proj.weight": 0.36044257082955694,
          "mlp.down_proj.weight": 0.41282750868942547,
          "input_layernorm.weight": 0.5088983795968438,
          "post_attention_layernorm.weight": 0.4496861231516873
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.6602980044411406,
          "self_attn.k_proj.weight": 0.6420444265398998,
          "self_attn.v_proj.weight": 0.5316983977800627,
          "self_attn.o_proj.weight": 0.42227379180088404,
          "mlp.gate_proj.weight": 0.3108678287915571,
          "mlp.up_proj.weight": 0.4041657299501923,
          "mlp.down_proj.weight": 0.43265705494920426,
          "input_layernorm.weight": 0.878605184021867,
          "post_attention_layernorm.weight": 0.5910226949526707
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.5,
          "self_attn.k_proj.weight": 0.2599271947445671,
          "self_attn.v_proj.weight": 0.4975028664708022,
          "self_attn.o_proj.weight": 0.5421607676915916,
          "mlp.gate_proj.weight": 0.3056325201378473,
          "mlp.up_proj.weight": 0.43990546921978413,
          "mlp.down_proj.weight": 0.43404790428518036,
          "input_layernorm.weight": 0.4213949074080623,
          "post_attention_layernorm.weight": 0.09070741198253394
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.05946946364281485,
          "self_attn.k_proj.weight": 0.05838806857596083,
          "self_attn.v_proj.weight": 0.3147092170971509,
          "self_attn.o_proj.weight": 0.17507343342847417,
          "mlp.gate_proj.weight": 0.20693857128512452,
          "mlp.up_proj.weight": 0.17476325022355804,
          "mlp.down_proj.weight": 0.21355502956692002,
          "input_layernorm.weight": 0.049311151072639454,
          "post_attention_layernorm.weight": 0.8488525701289025
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.008338692721690102,
          "self_attn.k_proj.weight": 0.29374913202825975,
          "self_attn.v_proj.weight": 0.6461261275459721,
          "self_attn.o_proj.weight": 0.4819790617006713,
          "mlp.gate_proj.weight": 0.47106954949105545,
          "mlp.up_proj.weight": 0.45120924747821234,
          "mlp.down_proj.weight": 0.49902002059029626,
          "input_layernorm.weight": 0.5308434897842321,
          "post_attention_layernorm.weight": 0.33720525286504943
        },
        "metric": "dwce"
      },
      {
        "num_batches": 64,
        "token_count": 131072.0,
        "norm": "relative",
        "supports_kwargs": true,
        "fuse_priors": {
          "self_attn.q_proj.weight": 0.9961265181629947,
          "self_attn.k_proj.weight": 0.6652207081963631,
          "self_attn.v_proj.weight": 0.6393890973156684,
          "self_attn.o_proj.weight": 0.5993613584307386,
          "mlp.gate_proj.weight": 0.3655389632232587,
          "mlp.up_proj.weight": 0.5096844599903007,
          "mlp.down_proj.weight": 0.4675655218682354,
          "input_layernorm.weight": 0.919265488845913,
          "post_attention_layernorm.weight": 0.21712004107502894
        },
        "metric": "dwce"
      },
      {
        "excluded": true
      }
    ],
    "supports_kwargs": true,
    "max_batches": 0,
    "norm": "relative",
    "metric": "dwce",
    "cosine_topk": 3,
    "start_index": 0,
    "excluded_pairs": [
      0,
      1,
      20
    ]
  },
  "fisher_num_batches": 64,
  "merge_method": "reparam",
  "merged_params": 3,
  "num_sequences": 128,
  "teacher_source": "previous_cycle_memory",
  "teacher_cycle": 10,
  "eval": {
    "datasets": [
      "wikitext"
    ],
    "configs": [
      "wikitext-2-raw-v1"
    ],
    "split": "test",
    "num_samples": 200,
    "seq_len": 1024,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    }
  },
  "comm": {
    "enabled": true,
    "train_mode": "lora",
    "opt_steps": 6,
    "grad_accum_steps": 1,
    "lr": 1e-05,
    "temp": 2.0,
    "steps_ratio": 0.1,
    "lr_scale": 0.1,
    "interaction_mode": "relative",
    "interaction_eps": 1e-08,
    "mu": 0.5,
    "mu_auto": true,
    "mu_auto_rho": 0.1,
    "mu_auto_eps": 1e-08,
    "sample_eta": 0.5,
    "sample_dwce_scale": 1.0,
    "topk": 1,
    "candidate_pairs": [
      16
    ],
    "trainable_params": 14417920,
    "pair_selection": {
      "num_pairs": 21,
      "excluded_pairs": [
        0,
        1,
        20
      ],
      "candidate_pairs": [
        16
      ],
      "total_samples": 6,
      "unique_pairs": 1,
      "counts": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        6,
        0,
        0,
        0,
        0
      ],
      "freqs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "probs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        1.0,
        0.0,
        0.0,
        0.0,
        0.0
      ],
      "top_pairs": [
        {
          "pair": 16,
          "count": 6,
          "freq": 1.0,
          "prob": 1.0
        }
      ]
    },
    "avg_loss": 0.0004086560936388632,
    "avg_anchor": 0.00023934131847151244,
    "avg_interaction": 0.0413870836297671,
    "avg_mu": 0.0037876796102222893,
    "teacher_source": "previous_cycle_memory",
    "teacher_cycle": 10,
    "dwce_scores_pre": [
      null,
      null,
      0.08755428350373622,
      0.09632033766022144,
      0.12102187652892513,
      0.07687722047289398,
      0.07646200702022135,
      0.10762881995392454,
      0.11769039904744791,
      0.13281737352618223,
      0.13359121989702824,
      0.12213174133312057,
      0.09479378554179792,
      0.0702238861787414,
      0.09607953229105111,
      0.1022260119464429,
      0.04662290344732959,
      0.08633913939177026,
      0.14327522026540693,
      0.14370887781314598,
      null
    ],
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 34.96986894820446
    },
    "post_selection_recomputed": false,
    "selected_layer_post": 16
  },
  "distill": {
    "enabled": true,
    "method": "reparam",
    "calib_samples": 256,
    "inst_samples": 0,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 1.0,
    "lr": 0.0001,
    "kl_weight": 0.02,
    "kl_temp": 4.0,
    "hidden_mse_weight": 1.0,
    "attn_mse_weight": 0.25,
    "mlp_mse_weight": 1.0,
    "reparam_eta": 0.0,
    "reparam_gamma": 0.0,
    "reparam_attn_reg_scale": 1.0,
    "reparam_mlp_reg_scale": 1.0,
    "reparam_param_subset": "mlp",
    "reparam_stats": {
      "enabled": true,
      "epochs": 1.0,
      "lr": 0.0001,
      "hidden_mse_weight": 1.0,
      "attn_mse_weight": 0.25,
      "mlp_mse_weight": 1.0,
      "eta": 0.0,
      "gamma": 0.0,
      "attn_reg_scale": 1.0,
      "mlp_reg_scale": 1.0,
      "param_subset": "mlp",
      "num_gates": 3,
      "num_attn_gates": 0,
      "num_mlp_gates": 3,
      "num_other_gates": 0,
      "lambda_init": "fisher_prior"
    },
    "reparam_gate_summary": {
      "num_tensors": 3,
      "num_elements": 176160768,
      "global_mean": 0.5134112267267137,
      "per_tensor_mean": {
        "mlp.gate_proj.weight": 0.5045510530471802,
        "mlp.up_proj.weight": 0.5129446983337402,
        "mlp.down_proj.weight": 0.522737979888916
      }
    },
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "instruction_dataset": null,
    "instruction_config": null,
    "instruction_split": "train"
  },
  "lora": {
    "enabled": false,
    "seq_len": 1024,
    "batch_size": 1,
    "epochs": 0.0,
    "rank": 8,
    "alpha": 16.0,
    "dropout": 0.0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "down_proj",
      "up_proj"
    ],
    "respect_exclude_pairs": false,
    "kl_enabled": false,
    "kl_weight": 0.1,
    "kl_temp": 4.0,
    "post_ppl": null,
    "lr": 0.0001,
    "weight_decay": 0.0,
    "max_grad_norm": 1.0,
    "grad_accum_steps": 1,
    "log_steps": 100,
    "eval_every": 2000,
    "eval_max_batches": null
  },
  "norm_policy": "hybrid",
  "full_model_saved": true,
  "full_model": {
    "stage": "cycle_11_full_model",
    "path": "llama3_mlp_skipdwce_nopermute/cycle_11/full_model",
    "weight_bytes": 11279358471,
    "post_ppl": {
      "wikitext:wikitext-2-raw-v1": 38.01892516739481
    },
    "resume_info": "resume_info.json"
  }
}